author | Martin Matuska <mm@FreeBSD.org> | 2022-05-18 22:54:40 +0000
---|---|---
committer | Martin Matuska <mm@FreeBSD.org> | 2022-05-18 22:55:59 +0000
commit | 716fd348e01c5f2ba125f878a634a753436c2994 | (patch)
tree | 0d738baf7a9ccfd90fa1e622f67e0399f306f024 | /sys/contrib/openzfs/module
parent | 4e2d3f26bd12610ef8672eefb02814b882a4c29b | (diff)
parent | c0cf6ed6792e545fd614c2a88cb53756db7e03f8 | (diff)
download | src-716fd348e01c5f2ba125f878a634a753436c2994.tar.gz | src-716fd348e01c5f2ba125f878a634a753436c2994.zip
zfs: merge openzfs/zfs@c0cf6ed67
Notable upstream pull request merges:
#10662 zvol_wait: Ignore locked zvols
#12789 Improve log spacemap load time
#12812 Improved zpool status output, list all affected datasets
#13277 FreeBSD: Use NDFREE_PNBUF if available
#13302 Make zfs_max_recordsize default to 16M
#13311 Fix error handling in FreeBSD's get/putpages VOPs
#13345 FreeBSD: Fix translation from ABD to physical pages
#13373 zfs: holds: dequadratify
#13375 Corrected edge case in uncompressed ARC->L2ARC handling
#13388 Improve mg_aliquot math
#13405 Reduce dbuf_find() lock contention
#13406 FreeBSD: use zero_region instead of allocating a dedicated page
Obtained from: OpenZFS
OpenZFS commit: c0cf6ed6792e545fd614c2a88cb53756db7e03f8
Diffstat (limited to 'sys/contrib/openzfs/module')
65 files changed, 2063 insertions, 1069 deletions
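Most of the churn below is a Linux build-system restructuring: the per-directory kernel modules (spl/, avl/, icp/, lua/, nvpair/, unicode/, zcommon/, zfs/, zstd/) and their individual Makefile.in files are removed, and Kbuild.in now links everything into just two modules, spl.ko and zfs.ko, by folding per-component object lists into `spl-objs` and `zfs-objs`. The fragment below is a minimal sketch of that pattern, not the actual Kbuild.in from this merge; the object lists are abbreviated for illustration.

```make
# Sketch of the consolidated Kbuild layout (object names abbreviated).
# Two modules are built instead of nine per-directory ones:
obj-$(CONFIG_ZFS) := spl.o zfs.o

# Each former subdirectory contributes an object list, prefixed with its path:
SPL_OBJS := spl-generic.o spl-kmem.o
spl-objs += $(addprefix os/linux/spl/,$(SPL_OBJS))

NVPAIR_OBJS := nvpair.o fnvpair.o
zfs-objs += $(addprefix nvpair/,$(NVPAIR_OBJS))

# Architecture-specific objects attach through the usual Kbuild conditionals:
ZFS_OBJS_X86 := vdev_raidz_math_sse2.o
zfs-$(CONFIG_X86) += $(addprefix zfs/,$(ZFS_OBJS_X86))
```

Because the per-subdirectory Makefiles that used to create their own output directories are gone, the accompanying Makefile.in change has the modules-Linux target pre-create them with `mkdir -p $(sort $(dir $(spl-objs) ...))` before invoking the kernel build, as seen in the diff that follows.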
diff --git a/sys/contrib/openzfs/module/Kbuild.in b/sys/contrib/openzfs/module/Kbuild.in index 1507965c5750..11099999fb87 100644 --- a/sys/contrib/openzfs/module/Kbuild.in +++ b/sys/contrib/openzfs/module/Kbuild.in @@ -1,20 +1,6 @@ # When integrated in to a monolithic kernel the spl module must appear # first. This ensures its module initialization function is run before # any of the other module initialization functions which depend on it. -ZFS_MODULES += spl/ -ZFS_MODULES += avl/ -ZFS_MODULES += icp/ -ZFS_MODULES += lua/ -ZFS_MODULES += nvpair/ -ZFS_MODULES += unicode/ -ZFS_MODULES += zcommon/ -ZFS_MODULES += zfs/ -ZFS_MODULES += zstd/ - -# The rest is only relevant when run by kbuild -ifneq ($(KERNELRELEASE),) - -obj-$(CONFIG_ZFS) := $(ZFS_MODULES) ZFS_MODULE_CFLAGS += -std=gnu99 -Wno-declaration-after-statement ZFS_MODULE_CFLAGS += -Wmissing-prototypes @@ -22,10 +8,16 @@ ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ @NO_FORMAT_ZERO_LENGTH@ ifneq ($(KBUILD_EXTMOD),) zfs_include = @abs_top_srcdir@/include +icp_include = @abs_srcdir@/icp/include +zstd_include = @abs_srcdir@/zstd/include ZFS_MODULE_CFLAGS += -include @abs_top_builddir@/zfs_config.h ZFS_MODULE_CFLAGS += -I@abs_top_builddir@/include +src = @abs_srcdir@ +obj = @abs_builddir@ else zfs_include = $(srctree)/include/zfs +icp_include = $(srctree)/$(src)/icp/include +zstd_include = $(srctree)/$(src)/zstd/include ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h endif @@ -36,12 +28,415 @@ ZFS_MODULE_CFLAGS += -I$(zfs_include) ZFS_MODULE_CPPFLAGS += -D_KERNEL ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@ +# KASAN enables -Werror=frame-larger-than=1024, which +# breaks oh so many parts of our build. +ifeq ($(CONFIG_KASAN),y) +ZFS_MODULE_CFLAGS += -Wno-error=frame-larger-than= +endif + ifneq ($(KBUILD_EXTMOD),) @CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include @CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@ endif -subdir-asflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) -subdir-ccflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) +asflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) +ccflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) + +# Suppress unused-value warnings in sparc64 architecture headers +ccflags-$(CONFIG_SPARC64) += -Wno-unused-value + + +obj-$(CONFIG_ZFS) := spl.o zfs.o + +SPL_OBJS := \ + spl-atomic.o \ + spl-condvar.o \ + spl-cred.o \ + spl-err.o \ + spl-generic.o \ + spl-kmem-cache.o \ + spl-kmem.o \ + spl-kstat.o \ + spl-proc.o \ + spl-procfs-list.o \ + spl-taskq.o \ + spl-thread.o \ + spl-trace.o \ + spl-tsd.o \ + spl-vmem.o \ + spl-xdr.o \ + spl-zlib.o + +spl-objs += $(addprefix os/linux/spl/,$(SPL_OBJS)) + +zfs-objs += avl/avl.o + +ICP_OBJS := \ + algs/aes/aes_impl.o \ + algs/aes/aes_impl_generic.o \ + algs/aes/aes_modes.o \ + algs/edonr/edonr.o \ + algs/modes/cbc.o \ + algs/modes/ccm.o \ + algs/modes/ctr.o \ + algs/modes/ecb.o \ + algs/modes/gcm.o \ + algs/modes/gcm_generic.o \ + algs/modes/modes.o \ + algs/sha2/sha2.o \ + algs/skein/skein.o \ + algs/skein/skein_block.o \ + algs/skein/skein_iv.o \ + api/kcf_cipher.o \ + api/kcf_ctxops.o \ + api/kcf_mac.o \ + core/kcf_callprov.o \ + core/kcf_mech_tabs.o \ + core/kcf_prov_lib.o \ + core/kcf_prov_tabs.o \ + core/kcf_sched.o \ + illumos-crypto.o \ + io/aes.o \ + io/sha2_mod.o \ + io/skein_mod.o \ + spi/kcf_spi.o + +ICP_OBJS_X86_64 := \ + asm-x86_64/aes/aes_aesni.o \ + asm-x86_64/aes/aes_amd64.o \ + asm-x86_64/aes/aeskey.o \ + asm-x86_64/modes/aesni-gcm-x86_64.o \ + asm-x86_64/modes/gcm_pclmulqdq.o \ + 
asm-x86_64/modes/ghash-x86_64.o \ + asm-x86_64/sha2/sha256_impl.o \ + asm-x86_64/sha2/sha512_impl.o + +ICP_OBJS_X86 := \ + algs/aes/aes_impl_aesni.o \ + algs/aes/aes_impl_x86-64.o \ + algs/modes/gcm_pclmulqdq.o + +zfs-objs += $(addprefix icp/,$(ICP_OBJS)) +zfs-$(CONFIG_X86) += $(addprefix icp/,$(ICP_OBJS_X86)) +zfs-$(CONFIG_X86_64) += $(addprefix icp/,$(ICP_OBJS_X86_64)) + +$(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64)) : asflags-y += -I$(icp_include) +$(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64)) : ccflags-y += -I$(icp_include) + +# Suppress objtool "can't find jump dest instruction at" warnings. They +# are caused by the constants which are defined in the text section of the +# assembly file using .byte instructions (e.g. bswap_mask). The objtool +# utility tries to interpret them as opcodes and obviously fails doing so. +OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y +OBJECT_FILES_NON_STANDARD_ghash-x86_64.o := y +# Suppress objtool "unsupported stack pointer realignment" warnings. We are +# not using a DRAP register while aligning the stack to a 64 byte boundary. +# See #6950 for the reasoning. +OBJECT_FILES_NON_STANDARD_sha256_impl.o := y +OBJECT_FILES_NON_STANDARD_sha512_impl.o := y + + +LUA_OBJS := \ + lapi.o \ + lauxlib.o \ + lbaselib.o \ + lcode.o \ + lcompat.o \ + lcorolib.o \ + lctype.o \ + ldebug.o \ + ldo.o \ + lfunc.o \ + lgc.o \ + llex.o \ + lmem.o \ + lobject.o \ + lopcodes.o \ + lparser.o \ + lstate.o \ + lstring.o \ + lstrlib.o \ + ltable.o \ + ltablib.o \ + ltm.o \ + lvm.o \ + lzio.o \ + setjmp/setjmp.o + +zfs-objs += $(addprefix lua/,$(LUA_OBJS)) + + +NVPAIR_OBJS := \ + fnvpair.o \ + nvpair.o \ + nvpair_alloc_fixed.o \ + nvpair_alloc_spl.o + +zfs-objs += $(addprefix nvpair/,$(NVPAIR_OBJS)) + + +UNICODE_OBJS := \ + u8_textprep.o \ + uconv.o + +zfs-objs += $(addprefix unicode/,$(UNICODE_OBJS)) + + +ZCOMMON_OBJS := \ + cityhash.o \ + zfeature_common.o \ + zfs_comutil.o \ + zfs_deleg.o \ + zfs_fletcher.o \ + zfs_fletcher_superscalar.o \ + zfs_fletcher_superscalar4.o \ + zfs_namecheck.o \ + zfs_prop.o \ + zpool_prop.o \ + zprop_common.o + +ZCOMMON_OBJS_X86 := \ + zfs_fletcher_avx512.o \ + zfs_fletcher_intel.o \ + zfs_fletcher_sse.o + +ZCOMMON_OBJS_ARM64 := \ + zfs_fletcher_aarch64_neon.o + +zfs-objs += $(addprefix zcommon/,$(ZCOMMON_OBJS)) +zfs-$(CONFIG_X86) += $(addprefix zcommon/,$(ZCOMMON_OBJS_X86)) +zfs-$(CONFIG_ARM64) += $(addprefix zcommon/,$(ZCOMMON_OBJS_ARM64)) + + +# Zstd uses -O3 by default, so we should follow +ZFS_ZSTD_FLAGS := -O3 + +# -fno-tree-vectorize gets set for gcc in zstd/common/compiler.h +# Set it for other compilers, too. 
+ZFS_ZSTD_FLAGS += -fno-tree-vectorize + +# SSE register return with SSE disabled if -march=znverX is passed +ZFS_ZSTD_FLAGS += -U__BMI__ + +# Quiet warnings about frame size due to unused code in unmodified zstd lib +ZFS_ZSTD_FLAGS += -Wframe-larger-than=20480 + +ZSTD_OBJS := \ + zfs_zstd.o \ + zstd_sparc.o + +ZSTD_UPSTREAM_OBJS := \ + lib/common/entropy_common.o \ + lib/common/error_private.o \ + lib/common/fse_decompress.o \ + lib/common/pool.o \ + lib/common/zstd_common.o \ + lib/compress/fse_compress.o \ + lib/compress/hist.o \ + lib/compress/huf_compress.o \ + lib/compress/zstd_compress.o \ + lib/compress/zstd_compress_literals.o \ + lib/compress/zstd_compress_sequences.o \ + lib/compress/zstd_compress_superblock.o \ + lib/compress/zstd_double_fast.o \ + lib/compress/zstd_fast.o \ + lib/compress/zstd_lazy.o \ + lib/compress/zstd_ldm.o \ + lib/compress/zstd_opt.o \ + lib/decompress/huf_decompress.o \ + lib/decompress/zstd_ddict.o \ + lib/decompress/zstd_decompress.o \ + lib/decompress/zstd_decompress_block.o + +zfs-objs += $(addprefix zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) + +# Disable aarch64 neon SIMD instructions for kernel mode +$(addprefix $(obj)/zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) : ccflags-y += -I$(zstd_include) $(ZFS_ZSTD_FLAGS) +$(addprefix $(obj)/zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) : asflags-y += -I$(zstd_include) +$(addprefix $(obj)/zstd/,$(ZSTD_UPSTREAM_OBJS)) : ccflags-y += -include $(zstd_include)/aarch64_compat.h -include $(zstd_include)/zstd_compat_wrapper.h -Wp,-w +$(obj)/zstd/zfs_zstd.o : ccflags-y += -include $(zstd_include)/zstd_compat_wrapper.h + + +ZFS_OBJS := \ + abd.o \ + aggsum.o \ + arc.o \ + blkptr.o \ + bplist.o \ + bpobj.o \ + bptree.o \ + bqueue.o \ + btree.o \ + dataset_kstats.o \ + dbuf.o \ + dbuf_stats.o \ + ddt.o \ + ddt_zap.o \ + dmu.o \ + dmu_diff.o \ + dmu_object.o \ + dmu_objset.o \ + dmu_recv.o \ + dmu_redact.o \ + dmu_send.o \ + dmu_traverse.o \ + dmu_tx.o \ + dmu_zfetch.o \ + dnode.o \ + dnode_sync.o \ + dsl_bookmark.o \ + dsl_crypt.o \ + dsl_dataset.o \ + dsl_deadlist.o \ + dsl_deleg.o \ + dsl_destroy.o \ + dsl_dir.o \ + dsl_pool.o \ + dsl_prop.o \ + dsl_scan.o \ + dsl_synctask.o \ + dsl_userhold.o \ + edonr_zfs.o \ + fm.o \ + gzip.o \ + hkdf.o \ + lz4.o \ + lz4_zfs.o \ + lzjb.o \ + metaslab.o \ + mmp.o \ + multilist.o \ + objlist.o \ + pathname.o \ + range_tree.o \ + refcount.o \ + rrwlock.o \ + sa.o \ + sha256.o \ + skein_zfs.o \ + spa.o \ + spa_boot.o \ + spa_checkpoint.o \ + spa_config.o \ + spa_errlog.o \ + spa_history.o \ + spa_log_spacemap.o \ + spa_misc.o \ + spa_stats.o \ + space_map.o \ + space_reftree.o \ + txg.o \ + uberblock.o \ + unique.o \ + vdev.o \ + vdev_cache.o \ + vdev_draid.o \ + vdev_draid_rand.o \ + vdev_indirect.o \ + vdev_indirect_births.o \ + vdev_indirect_mapping.o \ + vdev_initialize.o \ + vdev_label.o \ + vdev_mirror.o \ + vdev_missing.o \ + vdev_queue.o \ + vdev_raidz.o \ + vdev_raidz_math.o \ + vdev_raidz_math_scalar.o \ + vdev_rebuild.o \ + vdev_removal.o \ + vdev_root.o \ + vdev_trim.o \ + zap.o \ + zap_leaf.o \ + zap_micro.o \ + zcp.o \ + zcp_get.o \ + zcp_global.o \ + zcp_iter.o \ + zcp_set.o \ + zcp_synctask.o \ + zfeature.o \ + zfs_byteswap.o \ + zfs_fm.o \ + zfs_fuid.o \ + zfs_ioctl.o \ + zfs_log.o \ + zfs_onexit.o \ + zfs_quota.o \ + zfs_ratelimit.o \ + zfs_replay.o \ + zfs_rlock.o \ + zfs_sa.o \ + zfs_vnops.o \ + zil.o \ + zio.o \ + zio_checksum.o \ + zio_compress.o \ + zio_inject.o \ + zle.o \ + zrlock.o \ + zthr.o \ + zvol.o + +ZFS_OBJS_OS := \ + abd_os.o \ + arc_os.o \ + mmp_os.o \ + 
policy.o \ + qat.o \ + qat_compress.o \ + qat_crypt.o \ + spa_misc_os.o \ + trace.o \ + vdev_disk.o \ + vdev_file.o \ + zfs_acl.o \ + zfs_ctldir.o \ + zfs_debug.o \ + zfs_dir.o \ + zfs_file_os.o \ + zfs_ioctl_os.o \ + zfs_racct.o \ + zfs_sysfs.o \ + zfs_uio.o \ + zfs_vfsops.o \ + zfs_vnops_os.o \ + zfs_znode.o \ + zio_crypt.o \ + zpl_ctldir.o \ + zpl_export.o \ + zpl_file.o \ + zpl_inode.o \ + zpl_super.o \ + zpl_xattr.o \ + zvol_os.o + +ZFS_OBJS_X86 := \ + vdev_raidz_math_avx2.o \ + vdev_raidz_math_avx512bw.o \ + vdev_raidz_math_avx512f.o \ + vdev_raidz_math_sse2.o \ + vdev_raidz_math_ssse3.o + +ZFS_OBJS_ARM64 := \ + vdev_raidz_math_aarch64_neon.o \ + vdev_raidz_math_aarch64_neonx2.o + +ZFS_OBJS_PPC_PPC64 := \ + vdev_raidz_math_powerpc_altivec.o + +zfs-objs += $(addprefix zfs/,$(ZFS_OBJS)) $(addprefix os/linux/zfs/,$(ZFS_OBJS_OS)) +zfs-$(CONFIG_X86) += $(addprefix zfs/,$(ZFS_OBJS_X86)) +zfs-$(CONFIG_ARM64) += $(addprefix zfs/,$(ZFS_OBJS_ARM64)) +zfs-$(CONFIG_PPC) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64)) +zfs-$(CONFIG_PPC64) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64)) + +# Suppress incorrect warnings from versions of objtool which are not +# aware of x86 EVEX prefix instructions used for AVX512. +OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512bw.o := y +OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512f.o := y +ifeq ($(CONFIG_ALTIVEC),y) +$(obj)/zfs/vdev_raidz_math_powerpc_altivec.o : c_flags += -maltivec endif diff --git a/sys/contrib/openzfs/module/Makefile.in b/sys/contrib/openzfs/module/Makefile.in index 762f9394dd20..5b71e1abf79e 100644 --- a/sys/contrib/openzfs/module/Makefile.in +++ b/sys/contrib/openzfs/module/Makefile.in @@ -3,19 +3,19 @@ include Kbuild INSTALL_MOD_DIR ?= extra INSTALL_MOD_PATH ?= $(DESTDIR) -SUBDIR_TARGETS = icp lua zstd - all: modules distclean maintainer-clean: clean -install: modules_install -uninstall: modules_uninstall +install: modules_install data_install +uninstall: modules_uninstall data_uninstall check: .PHONY: all distclean maintainer-clean install uninstall check distdir \ modules modules-Linux modules-FreeBSD modules-unknown \ clean clean-Linux clean-FreeBSD \ modules_install modules_install-Linux modules_install-FreeBSD \ + data_install data_install-Linux data_install-FreeBSD \ modules_uninstall modules_uninstall-Linux modules_uninstall-FreeBSD \ + data_uninstall data_uninstall-Linux data_uninstall-FreeBSD \ cppcheck cppcheck-Linux cppcheck-FreeBSD # For FreeBSD, use debug options from ./configure if not overridden. 
@@ -51,7 +51,8 @@ endif FMAKE = env -u MAKEFLAGS make $(FMAKEFLAGS) modules-Linux: - list='$(SUBDIR_TARGETS)'; for td in $$list; do $(MAKE) -C $$td; done + mkdir -p $(sort $(dir $(spl-objs) $(spl-))) + mkdir -p $(sort $(dir $(zfs-objs) $(zfs-))) $(MAKE) -C @LINUX_OBJ@ $(if @KERNEL_CC@,CC=@KERNEL_CC@) \ $(if @KERNEL_LD@,LD=@KERNEL_LD@) $(if @KERNEL_LLVM@,LLVM=@KERNEL_LLVM@) \ M="$$PWD" @KERNEL_MAKE@ CONFIG_ZFS=m modules @@ -77,16 +78,20 @@ clean-FreeBSD: clean: clean-@ac_system@ -modules_install-Linux: +.PHONY: modules_uninstall-Linux-legacy +modules_uninstall-Linux-legacy: + $(RM) -r $(addprefix $(KMODDIR)/$(INSTALL_MOD_DIR)/,spl/ avl/ icp/ lua/ nvpair/ unicode/ zcommon/ zfs/ zstd/) + +KMODDIR := $(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@ +modules_install-Linux: modules_uninstall-Linux-legacy @# Install the kernel modules $(MAKE) -C @LINUX_OBJ@ M="$$PWD" modules_install \ INSTALL_MOD_PATH=$(INSTALL_MOD_PATH) \ INSTALL_MOD_DIR=$(INSTALL_MOD_DIR) \ KERNELRELEASE=@LINUX_VERSION@ @# Remove extraneous build products when packaging - kmoddir=$(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@; \ if [ -n "$(DESTDIR)" ]; then \ - find $$kmoddir -name 'modules.*' -delete; \ + find $(KMODDIR) -name 'modules.*' -delete; \ fi @# Debian ships tiny fake System.map files that are @# syntactically valid but just say @@ -107,18 +112,32 @@ modules_install-FreeBSD: modules_install: modules_install-@ac_system@ -modules_uninstall-Linux: +data_install-Linux: + @mkdir -p $(DESTDIR)/@prefix@/src/zfs-@VERSION@/@LINUX_VERSION@ + cp ../zfs.release ../zfs_config.h @LINUX_SYMBOLS@ $(DESTDIR)/@prefix@/src/zfs-@VERSION@/@LINUX_VERSION@ + +data_install-FreeBSD: + @ + +data_install: data_install-@ac_system@ + +modules_uninstall-Linux: modules_uninstall-Linux-legacy @# Uninstall the kernel modules - kmoddir=$(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@; \ - for objdir in $(ZFS_MODULES); do \ - $(RM) -R $$kmoddir/$(INSTALL_MOD_DIR)/$$objdir; \ - done + $(RM) $(addprefix $(KMODDIR)/$(INSTALL_MOD_DIR)/,zfs.ko spl.ko) modules_uninstall-FreeBSD: @false modules_uninstall: modules_uninstall-@ac_system@ +data_uninstall-Linux: + $(RM) $(addprefix $(DESTDIR)/@prefix@/src/zfs-@VERSION@/@LINUX_VERSION@/,zfs.release zfs_config.h @LINUX_SYMBOLS@) + +data_uninstall-FreeBSD: + @ + +data_uninstall: data_uninstall-@ac_system@ + cppcheck-Linux: @CPPCHECK@ -j@CPU_COUNT@ --std=c99 --quiet --force --error-exitcode=2 \ --inline-suppr \ @@ -126,7 +145,7 @@ cppcheck-Linux: --suppress=noValidConfiguration \ --enable=warning,information -D_KERNEL \ --include=@LINUX_OBJ@/include/generated/autoconf.h \ - --include=@top_srcdir@/zfs_config.h \ + --include=@top_builddir@/zfs_config.h \ --config-exclude=@LINUX_OBJ@/include \ -i zstd/lib \ -I @LINUX_OBJ@/include \ @@ -134,7 +153,7 @@ cppcheck-Linux: -I @top_srcdir@/include/os/linux/spl \ -I @top_srcdir@/include/os/linux/zfs \ -I @top_srcdir@/include \ - avl icp lua nvpair spl unicode zcommon zfs zstd os/linux + avl icp lua nvpair unicode zcommon zfs zstd os/linux cppcheck-FreeBSD: @true @@ -142,9 +161,11 @@ cppcheck-FreeBSD: cppcheck: cppcheck-@ac_system@ distdir: - (cd @srcdir@ && find $(ZFS_MODULES) os -name '*.[chS]') | \ - while read path; do \ - mkdir -p $$distdir/$${path%/*}; \ - cp @srcdir@/$$path $$distdir/$$path; \ - done; \ + cd @srcdir@ && find . 
-name '*.[chS]' -exec sh -c 'for f; do mkdir -p $$distdir/$${f%/*}; cp @srcdir@/$$f $$distdir/$$f; done' _ {} + cp @srcdir@/Makefile.bsd $$distdir/Makefile.bsd + +gen-zstd-symbols: + for obj in $(addprefix zstd/,$(ZSTD_UPSTREAM_OBJS)); do echo; echo "/* $${obj#zstd/}: */"; @OBJDUMP@ -t $$obj | awk '$$2 == "g" && !/ zfs_/ {print "#define\t" $$6 " zfs_" $$6}' | sort; done >> zstd/include/zstd_compat_wrapper.h + +check-zstd-symbols: + @OBJDUMP@ -t $(addprefix zstd/,$(ZSTD_UPSTREAM_OBJS)) | awk '/file format/ {print} $$2 == "g" && !/ zfs_/ {++ret; print} END {exit ret}' diff --git a/sys/contrib/openzfs/module/avl/Makefile.in b/sys/contrib/openzfs/module/avl/Makefile.in deleted file mode 100644 index 991d5f95b8c0..000000000000 --- a/sys/contrib/openzfs/module/avl/Makefile.in +++ /dev/null @@ -1,10 +0,0 @@ -ifneq ($(KBUILD_EXTMOD),) -src = @abs_srcdir@ -obj = @abs_builddir@ -endif - -MODULE := zavl - -obj-$(CONFIG_ZFS) := $(MODULE).o - -$(MODULE)-objs += avl.o diff --git a/sys/contrib/openzfs/module/avl/avl.c b/sys/contrib/openzfs/module/avl/avl.c index 3891a2d62880..69cb8bf6815b 100644 --- a/sys/contrib/openzfs/module/avl/avl.c +++ b/sys/contrib/openzfs/module/avl/avl.c @@ -1044,28 +1044,6 @@ done: return (AVL_NODE2DATA(node, off)); } -#if defined(_KERNEL) - -static int __init -avl_init(void) -{ - return (0); -} - -static void __exit -avl_fini(void) -{ -} - -module_init(avl_init); -module_exit(avl_fini); -#endif - -ZFS_MODULE_DESCRIPTION("Generic AVL tree implementation"); -ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); -ZFS_MODULE_LICENSE(ZFS_META_LICENSE); -ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); - EXPORT_SYMBOL(avl_create); EXPORT_SYMBOL(avl_find); EXPORT_SYMBOL(avl_insert); diff --git a/sys/contrib/openzfs/module/icp/Makefile.in b/sys/contrib/openzfs/module/icp/Makefile.in deleted file mode 100644 index 72c9ab12adb7..000000000000 --- a/sys/contrib/openzfs/module/icp/Makefile.in +++ /dev/null @@ -1,90 +0,0 @@ -ifneq ($(KBUILD_EXTMOD),) -src = @abs_srcdir@ -obj = @abs_builddir@ -icp_include = $(src)/include -else -icp_include = $(srctree)/$(src)/include -endif - -MODULE := icp - -obj-$(CONFIG_ZFS) := $(MODULE).o - -asflags-y := -I$(icp_include) -ccflags-y := -I$(icp_include) - -$(MODULE)-objs += illumos-crypto.o -$(MODULE)-objs += api/kcf_cipher.o -$(MODULE)-objs += api/kcf_mac.o -$(MODULE)-objs += api/kcf_ctxops.o -$(MODULE)-objs += core/kcf_callprov.o -$(MODULE)-objs += core/kcf_prov_tabs.o -$(MODULE)-objs += core/kcf_sched.o -$(MODULE)-objs += core/kcf_mech_tabs.o -$(MODULE)-objs += core/kcf_prov_lib.o -$(MODULE)-objs += spi/kcf_spi.o -$(MODULE)-objs += io/aes.o -$(MODULE)-objs += io/sha2_mod.o -$(MODULE)-objs += io/skein_mod.o -$(MODULE)-objs += algs/modes/cbc.o -$(MODULE)-objs += algs/modes/ccm.o -$(MODULE)-objs += algs/modes/ctr.o -$(MODULE)-objs += algs/modes/ecb.o -$(MODULE)-objs += algs/modes/gcm_generic.o -$(MODULE)-objs += algs/modes/gcm.o -$(MODULE)-objs += algs/modes/modes.o -$(MODULE)-objs += algs/aes/aes_impl_generic.o -$(MODULE)-objs += algs/aes/aes_impl.o -$(MODULE)-objs += algs/aes/aes_modes.o -$(MODULE)-objs += algs/edonr/edonr.o -$(MODULE)-objs += algs/sha2/sha2.o -$(MODULE)-objs += algs/skein/skein.o -$(MODULE)-objs += algs/skein/skein_block.o -$(MODULE)-objs += algs/skein/skein_iv.o - -$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/aes/aeskey.o -$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/aes/aes_amd64.o -$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/aes/aes_aesni.o -$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/modes/gcm_pclmulqdq.o -$(MODULE)-$(CONFIG_X86_64) += 
asm-x86_64/modes/aesni-gcm-x86_64.o -$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/modes/ghash-x86_64.o -$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/sha2/sha256_impl.o -$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/sha2/sha512_impl.o - -$(MODULE)-$(CONFIG_X86) += algs/modes/gcm_pclmulqdq.o -$(MODULE)-$(CONFIG_X86) += algs/aes/aes_impl_aesni.o -$(MODULE)-$(CONFIG_X86) += algs/aes/aes_impl_x86-64.o - -# Suppress objtool "can't find jump dest instruction at" warnings. They -# are caused by the constants which are defined in the text section of the -# assembly file using .byte instructions (e.g. bswap_mask). The objtool -# utility tries to interpret them as opcodes and obviously fails doing so. -OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y -OBJECT_FILES_NON_STANDARD_ghash-x86_64.o := y -# Suppress objtool "unsupported stack pointer realignment" warnings. We are -# not using a DRAP register while aligning the stack to a 64 byte boundary. -# See #6950 for the reasoning. -OBJECT_FILES_NON_STANDARD_sha256_impl.o := y -OBJECT_FILES_NON_STANDARD_sha512_impl.o := y - -ICP_DIRS = \ - api \ - core \ - spi \ - io \ - os \ - algs \ - algs/aes \ - algs/edonr \ - algs/modes \ - algs/sha2 \ - algs/skein \ - asm-x86_64 \ - asm-x86_64/aes \ - asm-x86_64/modes \ - asm-x86_64/sha2 \ - asm-i386 \ - asm-generic - -all: - mkdir -p $(ICP_DIRS) diff --git a/sys/contrib/openzfs/module/icp/algs/edonr/edonr.c b/sys/contrib/openzfs/module/icp/algs/edonr/edonr.c index 6f3a43e263be..9388a6f6b7c9 100644 --- a/sys/contrib/openzfs/module/icp/algs/edonr/edonr.c +++ b/sys/contrib/openzfs/module/icp/algs/edonr/edonr.c @@ -47,10 +47,7 @@ #define hashState384(x) ((x)->pipe->p512) #define hashState512(x) ((x)->pipe->p512) -/* shift and rotate shortcuts */ -#define shl(x, n) ((x) << n) -#define shr(x, n) ((x) >> n) - +/* rotate shortcuts */ #define rotl32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) #define rotr32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) diff --git a/sys/contrib/openzfs/module/icp/algs/modes/gcm.c b/sys/contrib/openzfs/module/icp/algs/modes/gcm.c index e666b45b5f44..ee2100b7f425 100644 --- a/sys/contrib/openzfs/module/icp/algs/modes/gcm.c +++ b/sys/contrib/openzfs/module/icp/algs/modes/gcm.c @@ -806,7 +806,7 @@ static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)]; * fallback to the fastest generic implementation. */ const gcm_impl_ops_t * -gcm_impl_get_ops() +gcm_impl_get_ops(void) { if (!kfpu_allowed()) return (&gcm_generic_impl); diff --git a/sys/contrib/openzfs/module/icp/illumos-crypto.c b/sys/contrib/openzfs/module/icp/illumos-crypto.c index f68f6bc765a2..d17b90e7200a 100644 --- a/sys/contrib/openzfs/module/icp/illumos-crypto.c +++ b/sys/contrib/openzfs/module/icp/illumos-crypto.c @@ -104,7 +104,7 @@ * ZFS Makefiles. 
*/ -void __exit +void icp_fini(void) { skein_mod_fini(); @@ -139,10 +139,7 @@ icp_init(void) return (0); } -#if defined(_KERNEL) +#if defined(_KERNEL) && defined(__FreeBSD__) module_exit(icp_fini); module_init(icp_init); -MODULE_AUTHOR(ZFS_META_AUTHOR); -MODULE_LICENSE(ZFS_META_LICENSE); -MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); #endif diff --git a/sys/contrib/openzfs/module/lua/Makefile.in b/sys/contrib/openzfs/module/lua/Makefile.in deleted file mode 100644 index 0a74c17e64e8..000000000000 --- a/sys/contrib/openzfs/module/lua/Makefile.in +++ /dev/null @@ -1,39 +0,0 @@ -ifneq ($(KBUILD_EXTMOD),) -src = @abs_srcdir@ -obj = @abs_builddir@ -endif - -MODULE := zlua - -obj-$(CONFIG_ZFS) := $(MODULE).o - -ccflags-y := -DLUA_USE_LONGLONG - -$(MODULE)-objs += lapi.o -$(MODULE)-objs += lauxlib.o -$(MODULE)-objs += lbaselib.o -$(MODULE)-objs += lcode.o -$(MODULE)-objs += lcompat.o -$(MODULE)-objs += lcorolib.o -$(MODULE)-objs += lctype.o -$(MODULE)-objs += ldebug.o -$(MODULE)-objs += ldo.o -$(MODULE)-objs += lfunc.o -$(MODULE)-objs += lgc.o -$(MODULE)-objs += llex.o -$(MODULE)-objs += lmem.o -$(MODULE)-objs += lobject.o -$(MODULE)-objs += lopcodes.o -$(MODULE)-objs += lparser.o -$(MODULE)-objs += lstate.o -$(MODULE)-objs += lstring.o -$(MODULE)-objs += lstrlib.o -$(MODULE)-objs += ltable.o -$(MODULE)-objs += ltablib.o -$(MODULE)-objs += ltm.o -$(MODULE)-objs += lvm.o -$(MODULE)-objs += lzio.o -$(MODULE)-objs += setjmp/setjmp.o - -all: - mkdir -p setjmp diff --git a/sys/contrib/openzfs/module/lua/lapi.c b/sys/contrib/openzfs/module/lua/lapi.c index 72b0037aa9a9..726e5c2ad4bb 100644 --- a/sys/contrib/openzfs/module/lua/lapi.c +++ b/sys/contrib/openzfs/module/lua/lapi.c @@ -1278,29 +1278,6 @@ LUA_API void lua_upvaluejoin (lua_State *L, int fidx1, int n1, luaC_objbarrier(L, f1, *up2); } -#if defined(_KERNEL) - -static int __init -lua_init(void) -{ - return (0); -} - -static void __exit -lua_fini(void) -{ -} - -module_init(lua_init); -module_exit(lua_fini); - -#endif - -ZFS_MODULE_DESCRIPTION("Lua Interpreter for ZFS"); -ZFS_MODULE_AUTHOR("Lua.org"); -ZFS_MODULE_LICENSE("Dual MIT/GPL"); -ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); - EXPORT_SYMBOL(lua_absindex); EXPORT_SYMBOL(lua_atpanic); EXPORT_SYMBOL(lua_checkstack); diff --git a/sys/contrib/openzfs/module/nvpair/Makefile.in b/sys/contrib/openzfs/module/nvpair/Makefile.in deleted file mode 100644 index d8145236674b..000000000000 --- a/sys/contrib/openzfs/module/nvpair/Makefile.in +++ /dev/null @@ -1,13 +0,0 @@ -ifneq ($(KBUILD_EXTMOD),) -src = @abs_srcdir@ -obj = @abs_builddir@ -endif - -MODULE := znvpair - -obj-$(CONFIG_ZFS) := $(MODULE).o - -$(MODULE)-objs += nvpair.o -$(MODULE)-objs += fnvpair.o -$(MODULE)-objs += nvpair_alloc_spl.o -$(MODULE)-objs += nvpair_alloc_fixed.o diff --git a/sys/contrib/openzfs/module/nvpair/nvpair.c b/sys/contrib/openzfs/module/nvpair/nvpair.c index a5222dac7849..a442990dade0 100644 --- a/sys/contrib/openzfs/module/nvpair/nvpair.c +++ b/sys/contrib/openzfs/module/nvpair/nvpair.c @@ -3678,27 +3678,6 @@ nvs_xdr(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen) return (err); } -#if defined(_KERNEL) -static int __init -nvpair_init(void) -{ - return (0); -} - -static void __exit -nvpair_fini(void) -{ -} - -module_init(nvpair_init); -module_exit(nvpair_fini); -#endif - -ZFS_MODULE_DESCRIPTION("Generic name/value pair implementation"); -ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); -ZFS_MODULE_LICENSE(ZFS_META_LICENSE); -ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); - 
EXPORT_SYMBOL(nv_alloc_init); EXPORT_SYMBOL(nv_alloc_reset); EXPORT_SYMBOL(nv_alloc_fini); diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c index 0354b986cd5f..e46271a039de 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c @@ -43,15 +43,11 @@ static struct opensolaris_utsname hw_utsname = { .machine = MACHINE }; -#ifndef KERNEL_STATIC -char hw_serial[11] = "0"; - utsname_t * utsname(void) { return (&hw_utsname); } -#endif static void opensolaris_utsname_init(void *arg) diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c index ebec77bdb37f..2a3c027c9389 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c @@ -46,19 +46,6 @@ ddi_strtol(const char *str, char **nptr, int base, long *result) } int -ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result) -{ - - if (str == hw_serial) { - *result = prison0.pr_hostid; - return (0); - } - - *result = strtoul(str, nptr, base); - return (0); -} - -int ddi_strtoull(const char *str, char **nptr, int base, unsigned long long *result) { diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c index 914e0e6ded66..1ac41f616a0d 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c @@ -1131,8 +1131,12 @@ vdev_geom_fill_unmap_cb(void *buf, size_t len, void *priv) vm_offset_t addr = (vm_offset_t)buf; vm_offset_t end = addr + len; - if (bp->bio_ma_n == 0) + if (bp->bio_ma_n == 0) { bp->bio_ma_offset = addr & PAGE_MASK; + addr &= ~PAGE_MASK; + } else { + ASSERT0(P2PHASE(addr, PAGE_SIZE)); + } do { bp->bio_ma[bp->bio_ma_n++] = PHYS_TO_VM_PAGE(pmap_kextract(addr)); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c index e33aaea481b1..e57855770293 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c @@ -97,6 +97,10 @@ VFS_SMR_DECLARE; +#if __FreeBSD_version < 1300103 +#define NDFREE_PNBUF(ndp) NDFREE((ndp), NDF_ONLY_PNBUF) +#endif + #if __FreeBSD_version >= 1300047 #define vm_page_wire_lock(pp) #define vm_page_wire_unlock(pp) @@ -237,7 +241,7 @@ zfs_open(vnode_t **vpp, int flag, cred_t *cr) } /* Keep a count of the synchronous opens in the znode */ - if (flag & (FSYNC | FDSYNC)) + if (flag & O_SYNC) atomic_inc_32(&zp->z_sync_cnt); ZFS_EXIT(zfsvfs); @@ -255,7 +259,7 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) ZFS_VERIFY_ZP(zp); /* Decrement the synchronous opens in the znode */ - if ((flag & (FSYNC | FDSYNC)) && (count == 1)) + if ((flag & O_SYNC) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); ZFS_EXIT(zfsvfs); @@ -4036,8 +4040,8 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int pgsin_b, pgsin_a; int error; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + ZFS_ENTER_ERROR(zfsvfs, zfs_vm_pagerret_error); + ZFS_VERIFY_ZP_ERROR(zp, zfs_vm_pagerret_error); start = IDX_TO_OFF(ma[0]->pindex); end = IDX_TO_OFF(ma[count - 1]->pindex + 1); @@ -4161,19 +4165,18 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, int err; int i; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - object = vp->v_object; - 
pcount = btoc(len); - ncount = pcount; - KASSERT(ma[0]->object == object, ("mismatching object")); KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); + pcount = btoc(len); + ncount = pcount; for (i = 0; i < pcount; i++) rtvals[i] = zfs_vm_pagerret_error; + ZFS_ENTER_ERROR(zfsvfs, zfs_vm_pagerret_error); + ZFS_VERIFY_ZP_ERROR(zp, zfs_vm_pagerret_error); + off = IDX_TO_OFF(ma[0]->pindex); blksz = zp->z_blksz; lo_off = rounddown(off, blksz); @@ -4399,11 +4402,11 @@ ioflags(int ioflags) int flags = 0; if (ioflags & IO_APPEND) - flags |= FAPPEND; + flags |= O_APPEND; if (ioflags & IO_NDELAY) - flags |= FNONBLOCK; + flags |= O_NONBLOCK; if (ioflags & IO_SYNC) - flags |= (FSYNC | FDSYNC | FRSYNC); + flags |= O_SYNC; return (flags); } @@ -4624,7 +4627,7 @@ zfs_freebsd_create(struct vop_create_args *ap) zfsvfs = ap->a_dvp->v_mount->mnt_data; *ap->a_vpp = NULL; - rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, !EXCL, mode, + rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, 0, mode, &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */); if (rc == 0) *ap->a_vpp = ZTOV(zp); @@ -5447,7 +5450,7 @@ zfs_getextattr(struct vop_getextattr_args *ap) error = ENOENT; ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp) + ZFS_VERIFY_ZP(zp); rw_enter(&zp->z_xattr_lock, RW_READER); error = zfs_getextattr_impl(ap, zfs_xattr_compat); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c index 2496d6897d9a..877b7187b676 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c @@ -153,6 +153,9 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; zp->z_vnode = NULL; + zp->z_sync_writes_cnt = 0; + zp->z_async_writes_cnt = 0; + return (0); } @@ -172,6 +175,9 @@ zfs_znode_cache_destructor(void *buf, void *arg) ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); + + ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); + ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); } @@ -453,6 +459,8 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; + zp->z_sync_writes_cnt = 0; + zp->z_async_writes_cnt = 0; #if __FreeBSD_version >= 1300139 atomic_store_ptr(&zp->z_cached_symlink, NULL); #endif diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c index 487778472e79..1011aaf68ac6 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c @@ -311,15 +311,13 @@ retry: err = SET_ERROR(EBUSY); goto out_opened; } -#ifdef FEXCL - if (flag & FEXCL) { + if (flag & O_EXCL) { if (zv->zv_open_count != 0) { err = SET_ERROR(EBUSY); goto out_opened; } zv->zv_flags |= ZVOL_EXCL; } -#endif zv->zv_open_count += count; out_opened: @@ -952,18 +950,16 @@ retry: err = SET_ERROR(EBUSY); goto out_opened; } -#ifdef FEXCL - if (flags & FEXCL) { + if (flags & O_EXCL) { if (zv->zv_open_count != 0) { err = SET_ERROR(EBUSY); goto out_opened; } zv->zv_flags |= ZVOL_EXCL; } -#endif zv->zv_open_count++; - if (flags & (FSYNC | FDSYNC)) { + if (flags & O_SYNC) { zsd = &zv->zv_zso->zso_dev; zsd->zsd_sync_cnt++; if (zsd->zsd_sync_cnt == 1 && @@ -1037,7 +1033,7 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) * You may get multiple opens, but only one close. 
*/ zv->zv_open_count--; - if (flags & (FSYNC | FDSYNC)) { + if (flags & O_SYNC) { zsd = &zv->zv_zso->zso_dev; zsd->zsd_sync_cnt--; } diff --git a/sys/contrib/openzfs/module/os/linux/spl/Makefile.in b/sys/contrib/openzfs/module/os/linux/spl/Makefile.in deleted file mode 100644 index b2325f91b4a7..000000000000 --- a/sys/contrib/openzfs/module/os/linux/spl/Makefile.in +++ /dev/null @@ -1,17 +0,0 @@ -$(MODULE)-objs += ../os/linux/spl/spl-atomic.o -$(MODULE)-objs += ../os/linux/spl/spl-condvar.o -$(MODULE)-objs += ../os/linux/spl/spl-cred.o -$(MODULE)-objs += ../os/linux/spl/spl-err.o -$(MODULE)-objs += ../os/linux/spl/spl-generic.o -$(MODULE)-objs += ../os/linux/spl/spl-kmem.o -$(MODULE)-objs += ../os/linux/spl/spl-kmem-cache.o -$(MODULE)-objs += ../os/linux/spl/spl-kstat.o -$(MODULE)-objs += ../os/linux/spl/spl-proc.o -$(MODULE)-objs += ../os/linux/spl/spl-procfs-list.o -$(MODULE)-objs += ../os/linux/spl/spl-taskq.o -$(MODULE)-objs += ../os/linux/spl/spl-thread.o -$(MODULE)-objs += ../os/linux/spl/spl-trace.o -$(MODULE)-objs += ../os/linux/spl/spl-tsd.o -$(MODULE)-objs += ../os/linux/spl/spl-vmem.o -$(MODULE)-objs += ../os/linux/spl/spl-xdr.o -$(MODULE)-objs += ../os/linux/spl/spl-zlib.o diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c index cc9a973fef62..f99a2f966660 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c @@ -425,22 +425,33 @@ EXPORT_SYMBOL(__aeabi_ldivmod); * functions against their Solaris counterparts. It is possible that I * may have misinterpreted the man page or the man page is incorrect. */ -int ddi_strtoul(const char *, char **, int, unsigned long *); int ddi_strtol(const char *, char **, int, long *); int ddi_strtoull(const char *, char **, int, unsigned long long *); int ddi_strtoll(const char *, char **, int, long long *); -#define define_ddi_strtoux(type, valtype) \ -int ddi_strtou##type(const char *str, char **endptr, \ +#define define_ddi_strtox(type, valtype) \ +int ddi_strto##type(const char *str, char **endptr, \ int base, valtype *result) \ { \ valtype last_value, value = 0; \ char *ptr = (char *)str; \ - int flag = 1, digit; \ + int digit, minus = 0; \ + \ + while (strchr(" \t\n\r\f", *ptr)) \ + ++ptr; \ \ if (strlen(ptr) == 0) \ return (EINVAL); \ \ + switch (*ptr) { \ + case '-': \ + minus = 1; \ + zfs_fallthrough; \ + case '+': \ + ++ptr; \ + break; \ + } \ + \ /* Auto-detect base based on prefix */ \ if (!base) { \ if (str[0] == '0') { \ @@ -474,46 +485,21 @@ int ddi_strtou##type(const char *str, char **endptr, \ if (last_value > value) /* Overflow */ \ return (ERANGE); \ \ - flag = 1; \ ptr++; \ } \ \ - if (flag) \ - *result = value; \ + *result = minus ? -value : value; \ \ if (endptr) \ - *endptr = (char *)(flag ? 
ptr : str); \ + *endptr = ptr; \ \ return (0); \ } \ -#define define_ddi_strtox(type, valtype) \ -int ddi_strto##type(const char *str, char **endptr, \ - int base, valtype *result) \ -{ \ - int rc; \ - \ - if (*str == '-') { \ - rc = ddi_strtou##type(str + 1, endptr, base, result); \ - if (!rc) { \ - if (*endptr == str + 1) \ - *endptr = (char *)str; \ - else \ - *result = -*result; \ - } \ - } else { \ - rc = ddi_strtou##type(str, endptr, base, result); \ - } \ - \ - return (rc); \ -} - -define_ddi_strtoux(l, unsigned long) define_ddi_strtox(l, long) -define_ddi_strtoux(ll, unsigned long long) +define_ddi_strtox(ull, unsigned long long) define_ddi_strtox(ll, long long) -EXPORT_SYMBOL(ddi_strtoul); EXPORT_SYMBOL(ddi_strtol); EXPORT_SYMBOL(ddi_strtoll); EXPORT_SYMBOL(ddi_strtoull); @@ -828,7 +814,7 @@ spl_fini(void) module_init(spl_init); module_exit(spl_fini); -ZFS_MODULE_DESCRIPTION("Solaris Porting Layer"); -ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); -ZFS_MODULE_LICENSE("GPL"); -ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); +MODULE_DESCRIPTION("Solaris Porting Layer"); +MODULE_AUTHOR(ZFS_META_AUTHOR); +MODULE_LICENSE("GPL"); +MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c index bb2b56880646..33aaad653dc8 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c @@ -1420,7 +1420,7 @@ EXPORT_SYMBOL(spl_kmem_cache_reap_now); * it should do no harm. */ int -spl_kmem_cache_reap_active() +spl_kmem_cache_reap_active(void) { return (0); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in b/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in deleted file mode 100644 index fa990776db83..000000000000 --- a/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in +++ /dev/null @@ -1,38 +0,0 @@ -# -# Linux specific sources included from module/zfs/Makefile.in -# - -# Suppress unused-value warnings in sparc64 architecture headers -ccflags-$(CONFIG_SPARC64) += -Wno-unused-value - -$(MODULE)-objs += ../os/linux/zfs/abd_os.o -$(MODULE)-objs += ../os/linux/zfs/arc_os.o -$(MODULE)-objs += ../os/linux/zfs/mmp_os.o -$(MODULE)-objs += ../os/linux/zfs/policy.o -$(MODULE)-objs += ../os/linux/zfs/trace.o -$(MODULE)-objs += ../os/linux/zfs/qat.o -$(MODULE)-objs += ../os/linux/zfs/qat_compress.o -$(MODULE)-objs += ../os/linux/zfs/qat_crypt.o -$(MODULE)-objs += ../os/linux/zfs/spa_misc_os.o -$(MODULE)-objs += ../os/linux/zfs/vdev_disk.o -$(MODULE)-objs += ../os/linux/zfs/vdev_file.o -$(MODULE)-objs += ../os/linux/zfs/zfs_acl.o -$(MODULE)-objs += ../os/linux/zfs/zfs_ctldir.o -$(MODULE)-objs += ../os/linux/zfs/zfs_debug.o -$(MODULE)-objs += ../os/linux/zfs/zfs_dir.o -$(MODULE)-objs += ../os/linux/zfs/zfs_file_os.o -$(MODULE)-objs += ../os/linux/zfs/zfs_ioctl_os.o -$(MODULE)-objs += ../os/linux/zfs/zfs_racct.o -$(MODULE)-objs += ../os/linux/zfs/zfs_sysfs.o -$(MODULE)-objs += ../os/linux/zfs/zfs_uio.o -$(MODULE)-objs += ../os/linux/zfs/zfs_vfsops.o -$(MODULE)-objs += ../os/linux/zfs/zfs_vnops_os.o -$(MODULE)-objs += ../os/linux/zfs/zfs_znode.o -$(MODULE)-objs += ../os/linux/zfs/zio_crypt.o -$(MODULE)-objs += ../os/linux/zfs/zpl_ctldir.o -$(MODULE)-objs += ../os/linux/zfs/zpl_export.o -$(MODULE)-objs += ../os/linux/zfs/zpl_file.o -$(MODULE)-objs += ../os/linux/zfs/zpl_inode.o -$(MODULE)-objs += ../os/linux/zfs/zpl_super.o -$(MODULE)-objs += ../os/linux/zfs/zpl_xattr.o -$(MODULE)-objs += 
../os/linux/zfs/zvol_os.o diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c index 688458621b93..0cd4fa5213d4 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c @@ -620,7 +620,6 @@ abd_alloc_zero_scatter(void) ABD_SCATTER(abd_zero_scatter).abd_offset = 0; ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages; abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; - zfs_refcount_create(&abd_zero_scatter->abd_children); ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages * sizeof (struct scatterlist), KM_SLEEP); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c index 6cec5be44012..235cd1691c14 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c @@ -467,8 +467,11 @@ vdev_submit_bio_impl(struct bio *bio) * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched(). * As a side effect the function was converted to GPL-only. Define our * own version when needed which uses rcu_read_lock_sched(). + * + * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public + * part, moving blkg_tryget into the private one. Define our own version. */ -#if defined(HAVE_BLKG_TRYGET_GPL_ONLY) +#if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET) static inline bool vdev_blkg_tryget(struct blkcg_gq *blkg) { @@ -493,7 +496,7 @@ vdev_blkg_tryget(struct blkcg_gq *blkg) return (rc); } -#elif defined(HAVE_BLKG_TRYGET) +#else #define vdev_blkg_tryget(bg) blkg_tryget(bg) #endif #ifdef HAVE_BIO_SET_DEV_MACRO diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c index 351e4dad799c..b70691ab31c1 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c @@ -863,6 +863,26 @@ zfs_unix_to_v4(uint32_t access_mask) return (new_mask); } + +static int +zfs_v4_to_unix(uint32_t access_mask, int *unmapped) +{ + int new_mask = 0; + + *unmapped = access_mask & + (ACE_WRITE_OWNER | ACE_WRITE_ACL | ACE_DELETE); + + if (access_mask & WRITE_MASK) + new_mask |= S_IWOTH; + if (access_mask & ACE_READ_DATA) + new_mask |= S_IROTH; + if (access_mask & ACE_EXECUTE) + new_mask |= S_IXOTH; + + return (new_mask); +} + + static void zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, uint16_t access_type, uint64_t fuid, uint16_t entry_type) @@ -2399,6 +2419,53 @@ zfs_has_access(znode_t *zp, cred_t *cr) return (B_TRUE); } +/* + * Simplified access check for case where ACL is known to not contain + * information beyond what is defined in the mode. In this case, we + * can pass along to the kernel / vfs generic_permission() check, which + * evaluates the mode and POSIX ACL. + * + * NFSv4 ACLs allow granting permissions that are usually relegated only + * to the file owner or superuser. Examples are ACE_WRITE_OWNER (chown), + * ACE_WRITE_ACL(chmod), and ACE_DELETE. ACE_DELETE requests must fail + * because with conventional posix permissions, right to delete file + * is determined by write bit on the parent dir. + * + * If unmappable perms are requested, then we must return EPERM + * and include those bits in the working_mode so that the caller of + * zfs_zaccess_common() can decide whether to perform additional + * policy / capability checks. 
EACCES is used in zfs_zaccess_aces_check() + * to indicate access check failed due to explicit DENY entry, and so + * we want to avoid that here. + */ +static int +zfs_zaccess_trivial(znode_t *zp, uint32_t *working_mode, cred_t *cr) +{ + int err, mask; + int unmapped = 0; + + ASSERT(zp->z_pflags & ZFS_ACL_TRIVIAL); + + mask = zfs_v4_to_unix(*working_mode, &unmapped); + if (mask == 0 || unmapped) { + *working_mode = unmapped; + return (unmapped ? SET_ERROR(EPERM) : 0); + } + +#if defined(HAVE_IOPS_PERMISSION_USERNS) + err = generic_permission(cr->user_ns, ZTOI(zp), mask); +#else + err = generic_permission(ZTOI(zp), mask); +#endif + if (err != 0) { + return (SET_ERROR(EPERM)); + } + + *working_mode = unmapped; + + return (0); +} + static int zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) @@ -2450,6 +2517,9 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, return (SET_ERROR(EPERM)); } + if (zp->z_pflags & ZFS_ACL_TRIVIAL) + return (zfs_zaccess_trivial(zp, working_mode, cr)); + return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c index f7e71461a3bd..aae19f6346fd 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c @@ -162,7 +162,7 @@ zfsctl_snapshot_free(zfs_snapentry_t *se) zfs_refcount_destroy(&se->se_refcount); kmem_strfree(se->se_name); kmem_strfree(se->se_path); - rw_destroy(se->se_taskqid_lock); + rw_destroy(&se->se_taskqid_lock); kmem_free(se, sizeof (zfs_snapentry_t)); } @@ -496,6 +496,8 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, zp->z_pflags = 0; zp->z_mode = 0; zp->z_sync_cnt = 0; + zp->z_sync_writes_cnt = 0; + zp->z_async_writes_cnt = 0; ip->i_generation = 0; ip->i_ino = id; ip->i_mode = (S_IFDIR | S_IRWXUGO); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c index fee3fe540b90..c65702e1a053 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c @@ -58,6 +58,8 @@ #include <sys/zvol.h> #include <sys/fm/util.h> #include <sys/dsl_crypt.h> +#include <sys/crypto/icp.h> +#include <sys/zstd/zstd.h> #include <sys/zfs_ioctl_impl.h> @@ -233,8 +235,8 @@ zfsdev_detach(void) #define ZFS_DEBUG_STR "" #endif -static int __init -openzfs_init(void) +static int +openzfs_init_os(void) { int error; @@ -259,8 +261,8 @@ openzfs_init(void) return (0); } -static void __exit -openzfs_fini(void) +static void +openzfs_fini_os(void) { zfs_sysfs_fini(); zfs_kmod_fini(); @@ -269,12 +271,59 @@ openzfs_fini(void) ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR); } + +extern int __init zcommon_init(void); +extern void zcommon_fini(void); + +static int __init +openzfs_init(void) +{ + int err; + if ((err = zcommon_init()) != 0) + goto zcommon_failed; + if ((err = icp_init()) != 0) + goto icp_failed; + if ((err = zstd_init()) != 0) + goto zstd_failed; + if ((err = openzfs_init_os()) != 0) + goto openzfs_os_failed; + return (0); + +openzfs_os_failed: + zstd_fini(); +zstd_failed: + icp_fini(); +icp_failed: + zcommon_fini(); +zcommon_failed: + return (err); +} + +static void __exit +openzfs_fini(void) +{ + openzfs_fini_os(); + zstd_fini(); + icp_fini(); + zcommon_fini(); +} + #if defined(_KERNEL) module_init(openzfs_init); module_exit(openzfs_fini); 
#endif -ZFS_MODULE_DESCRIPTION("ZFS"); -ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); -ZFS_MODULE_LICENSE(ZFS_META_LICENSE); -ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); +MODULE_ALIAS("zavl"); +MODULE_ALIAS("icp"); +MODULE_ALIAS("zlua"); +MODULE_ALIAS("znvpair"); +MODULE_ALIAS("zunicode"); +MODULE_ALIAS("zcommon"); +MODULE_ALIAS("zzstd"); +MODULE_DESCRIPTION("ZFS"); +MODULE_AUTHOR(ZFS_META_AUTHOR); +MODULE_LICENSE("Lua: MIT"); +MODULE_LICENSE("zstd: Dual BSD/GPL"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_LICENSE(ZFS_META_LICENSE); +MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c index 6f71382cf74e..eb7c5f6166d2 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c @@ -65,16 +65,15 @@ /* * A zfs_mod_kobj_t represents a zfs kobject under '/sys/module/zfs' */ -struct zfs_mod_kobj; typedef struct zfs_mod_kobj zfs_mod_kobj_t; - struct zfs_mod_kobj { struct kobject zko_kobj; struct kobj_type zko_kobj_type; struct sysfs_ops zko_sysfs_ops; size_t zko_attr_count; struct attribute *zko_attr_list; /* allocated */ - struct attribute **zko_default_attrs; /* allocated */ + struct attribute_group zko_default_group; /* .attrs allocated */ + const struct attribute_group *zko_default_groups[2]; size_t zko_child_count; zfs_mod_kobj_t *zko_children; /* allocated */ }; @@ -127,10 +126,10 @@ zfs_kobj_release(struct kobject *kobj) zkobj->zko_attr_list = NULL; } - if (zkobj->zko_default_attrs != NULL) { - kmem_free(zkobj->zko_default_attrs, + if (zkobj->zko_default_group.attrs != NULL) { + kmem_free(zkobj->zko_default_group.attrs, DEFAULT_ATTR_SIZE(zkobj->zko_attr_count)); - zkobj->zko_default_attrs = NULL; + zkobj->zko_default_group.attrs = NULL; } if (zkobj->zko_child_count != 0) { @@ -154,11 +153,12 @@ zfs_kobj_add_attr(zfs_mod_kobj_t *zkobj, int attr_num, const char *attr_name) { VERIFY3U(attr_num, <, zkobj->zko_attr_count); ASSERT(zkobj->zko_attr_list); - ASSERT(zkobj->zko_default_attrs); + ASSERT(zkobj->zko_default_group.attrs); zkobj->zko_attr_list[attr_num].name = attr_name; zkobj->zko_attr_list[attr_num].mode = 0444; - zkobj->zko_default_attrs[attr_num] = &zkobj->zko_attr_list[attr_num]; + zkobj->zko_default_group.attrs[attr_num] = + &zkobj->zko_attr_list[attr_num]; sysfs_attr_init(&zkobj->zko_attr_list[attr_num]); } @@ -176,9 +176,9 @@ zfs_kobj_init(zfs_mod_kobj_t *zkobj, int attr_cnt, int child_cnt, return (ENOMEM); } /* this will always have at least one slot for NULL termination */ - zkobj->zko_default_attrs = kmem_zalloc(DEFAULT_ATTR_SIZE(attr_cnt), - KM_SLEEP); - if (zkobj->zko_default_attrs == NULL) { + zkobj->zko_default_group.attrs = + kmem_zalloc(DEFAULT_ATTR_SIZE(attr_cnt), KM_SLEEP); + if (zkobj->zko_default_group.attrs == NULL) { if (zkobj->zko_attr_list != NULL) { kmem_free(zkobj->zko_attr_list, ATTR_TABLE_SIZE(attr_cnt)); @@ -186,14 +186,19 @@ zfs_kobj_init(zfs_mod_kobj_t *zkobj, int attr_cnt, int child_cnt, return (ENOMEM); } zkobj->zko_attr_count = attr_cnt; - zkobj->zko_kobj_type.default_attrs = zkobj->zko_default_attrs; + zkobj->zko_default_groups[0] = &zkobj->zko_default_group; +#ifdef HAVE_SYSFS_DEFAULT_GROUPS + zkobj->zko_kobj_type.default_groups = zkobj->zko_default_groups; +#else + zkobj->zko_kobj_type.default_attrs = zkobj->zko_default_group.attrs; +#endif if (child_cnt > 0) { zkobj->zko_children = kmem_zalloc(CHILD_TABLE_SIZE(child_cnt), KM_SLEEP); if (zkobj->zko_children == NULL) 
{ - if (zkobj->zko_default_attrs != NULL) { - kmem_free(zkobj->zko_default_attrs, + if (zkobj->zko_default_group.attrs != NULL) { + kmem_free(zkobj->zko_default_group.attrs, DEFAULT_ATTR_SIZE(attr_cnt)); } if (zkobj->zko_attr_list != NULL) { @@ -215,9 +220,9 @@ zfs_kobj_init(zfs_mod_kobj_t *zkobj, int attr_cnt, int child_cnt, static int zfs_kobj_add(zfs_mod_kobj_t *zkobj, struct kobject *parent, const char *name) { - /* zko_default_attrs must be NULL terminated */ - ASSERT(zkobj->zko_default_attrs != NULL); - ASSERT(zkobj->zko_default_attrs[zkobj->zko_attr_count] == NULL); + /* zko_default_group.attrs must be NULL terminated */ + ASSERT(zkobj->zko_default_group.attrs != NULL); + ASSERT(zkobj->zko_default_group.attrs[zkobj->zko_attr_count] == NULL); kobject_init(&zkobj->zko_kobj, &zkobj->zko_kobj_type); return (kobject_add(&zkobj->zko_kobj, parent, name)); @@ -226,7 +231,7 @@ zfs_kobj_add(zfs_mod_kobj_t *zkobj, struct kobject *parent, const char *name) /* * Each zfs property has these common attributes */ -static const char *zprop_attrs[] = { +static const char *const zprop_attrs[] = { "type", "readonly", "setonce", @@ -239,7 +244,7 @@ static const char *zprop_attrs[] = { #define ZFS_PROP_ATTR_COUNT ARRAY_SIZE(zprop_attrs) #define ZPOOL_PROP_ATTR_COUNT (ZFS_PROP_ATTR_COUNT - 1) -static const char *zprop_types[] = { +static const char *const zprop_types[] = { "number", "string", "index", @@ -250,7 +255,7 @@ typedef struct zfs_type_map { const char *ztm_name; } zfs_type_map_t; -static zfs_type_map_t type_map[] = { +static const zfs_type_map_t type_map[] = { {ZFS_TYPE_FILESYSTEM, "filesystem"}, {ZFS_TYPE_SNAPSHOT, "snapshot"}, {ZFS_TYPE_VOLUME, "volume"}, @@ -371,7 +376,7 @@ pool_property_show(struct kobject *kobj, struct attribute *attr, char *buf) * A user process can easily check if the running zfs kernel module * supports the new feature. */ -static const char *zfs_kernel_features[] = { +static const char *const zfs_kernel_features[] = { /* --> Add new kernel features here */ "com.delphix:vdev_initialize", "org.zfsonlinux:vdev_trim", @@ -439,7 +444,7 @@ zfs_kernel_features_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent) /* * Each pool feature has these common attributes */ -static const char *pool_feature_attrs[] = { +static const char *const pool_feature_attrs[] = { "description", "guid", "uname", diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c index ce47b3e6087a..4f31bcb5959d 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c @@ -248,7 +248,7 @@ zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio) /* touch each page in this segment. */ p = iov->iov_base + skip; while (cnt) { - if (get_user(tmp, (uint8_t *)p)) + if (copy_from_user(&tmp, p, 1)) return (EFAULT); ulong_t incr = MIN(cnt, PAGESIZE); p += incr; @@ -256,7 +256,7 @@ zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio) } /* touch the last byte in case it straddles a page. 
*/ p--; - if (get_user(tmp, (uint8_t *)p)) + if (copy_from_user(&tmp, p, 1)) return (EFAULT); } } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c index ece7c373e852..d6ff838806eb 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c @@ -474,7 +474,7 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, */ if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0, - B_FALSE, cr))) { + B_TRUE, cr))) { zrele(*zpp); *zpp = NULL; } @@ -3396,7 +3396,7 @@ top: } static void -zfs_putpage_commit_cb(void *arg) +zfs_putpage_sync_commit_cb(void *arg) { struct page *pp = arg; @@ -3404,13 +3404,26 @@ zfs_putpage_commit_cb(void *arg) end_page_writeback(pp); } +static void +zfs_putpage_async_commit_cb(void *arg) +{ + struct page *pp = arg; + znode_t *zp = ITOZ(pp->mapping->host); + + ClearPageError(pp); + end_page_writeback(pp); + atomic_dec_32(&zp->z_async_writes_cnt); +} + /* * Push a page out to disk, once the page is on stable storage the * registered commit callback will be run as notification of completion. * - * IN: ip - page mapped for inode. - * pp - page to push (page is locked) - * wbc - writeback control data + * IN: ip - page mapped for inode. + * pp - page to push (page is locked) + * wbc - writeback control data + * for_sync - does the caller intend to wait synchronously for the + * page writeback to complete? * * RETURN: 0 if success * error code if failure @@ -3419,7 +3432,8 @@ zfs_putpage_commit_cb(void *arg) * ip - ctime|mtime updated */ int -zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) +zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, + boolean_t for_sync) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); @@ -3517,6 +3531,16 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) zfs_rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) { + /* + * Speed up any non-sync page writebacks since + * they may take several seconds to complete. + * Refer to the comment in zpl_fsync() (when + * HAVE_FSYNC_RANGE is defined) for details. + */ + if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { + zil_commit(zfsvfs->z_log, zp->z_id); + } + if (PageWriteback(pp)) #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT folio_wait_bit(page_folio(pp), PG_writeback); @@ -3542,6 +3566,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) * was in fact not skipped and should not be counted as if it were. */ wbc->pages_skipped--; + if (!for_sync) + atomic_inc_32(&zp->z_async_writes_cnt); set_page_writeback(pp); unlock_page(pp); @@ -3556,9 +3582,15 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) dmu_tx_wait(tx); dmu_tx_abort(tx); +#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO + filemap_dirty_folio(page_mapping(pp), page_folio(pp)); +#else __set_page_dirty_nobuffers(pp); +#endif ClearPageError(pp); end_page_writeback(pp); + if (!for_sync) + atomic_dec_32(&zp->z_async_writes_cnt); zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (err); @@ -3583,7 +3615,9 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, - zfs_putpage_commit_cb, pp); + for_sync ? 
zfs_putpage_sync_commit_cb : + zfs_putpage_async_commit_cb, pp); + dmu_tx_commit(tx); zfs_rangelock_exit(lr); @@ -3595,6 +3629,16 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) * performance reasons. */ zil_commit(zfsvfs->z_log, zp->z_id); + } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) { + /* + * If the caller does not intend to wait synchronously + * for this page writeback to complete and there are active + * synchronous calls on this file, do a commit so that + * the latter don't accidentally end up waiting for + * our writeback to complete. Refer to the comment in + * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details. + */ + zil_commit(zfsvfs->z_log, zp->z_id); } dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c index b76e65d16822..d921f2b07463 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c @@ -134,6 +134,9 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zp->z_acl_cached = NULL; zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; + zp->z_sync_writes_cnt = 0; + zp->z_async_writes_cnt = 0; + return (0); } @@ -154,6 +157,9 @@ zfs_znode_cache_destructor(void *buf, void *arg) ASSERT3P(zp->z_dirlocks, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); + + ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); + ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); } static int @@ -554,6 +560,8 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; + zp->z_sync_writes_cnt = 0; + zp->z_async_writes_cnt = 0; zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c index f78e50262af7..8b84eb795fc3 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c @@ -33,9 +33,13 @@ #include <sys/zfs_vfsops.h> #include <sys/zfs_vnops.h> #include <sys/zfs_project.h> -#ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS +#if defined(HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS) || \ + defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO) #include <linux/pagemap.h> #endif +#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO +#include <linux/writeback.h> +#endif /* * When using fallocate(2) to preallocate space, inflate the requested @@ -161,17 +165,56 @@ static int zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; + znode_t *zp = ITOZ(inode); + zfsvfs_t *zfsvfs = ITOZSB(inode); cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; + /* + * The variables z_sync_writes_cnt and z_async_writes_cnt work in + * tandem so that sync writes can detect if there are any non-sync + * writes going on and vice-versa. The "vice-versa" part to this logic + * is located in zfs_putpage() where non-sync writes check if there are + * any ongoing sync writes. If any sync and non-sync writes overlap, + * we do a commit to complete the non-sync writes since the latter can + * potentially take several seconds to complete and thus block sync + * writes in the upcoming call to filemap_write_and_wait_range(). 
+ */ + atomic_inc_32(&zp->z_sync_writes_cnt); + /* + * If the following check does not detect an overlapping non-sync write + * (say because it's just about to start), then it is guaranteed that + * the non-sync write will detect this sync write. This is because we + * always increment z_sync_writes_cnt / z_async_writes_cnt before doing + * the check on z_async_writes_cnt / z_sync_writes_cnt here and in + * zfs_putpage() respectively. + */ + if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { + ZPL_ENTER(zfsvfs); + zil_commit(zfsvfs->z_log, zp->z_id); + ZPL_EXIT(zfsvfs); + } + error = filemap_write_and_wait_range(inode->i_mapping, start, end); + + /* + * The sync write is not complete yet but we decrement + * z_sync_writes_cnt since zfs_fsync() increments and decrements + * it internally. If a non-sync write starts just after the decrement + * operation but before we call zfs_fsync(), it may not detect this + * overlapping sync write but it does not matter since we have already + * gone past filemap_write_and_wait_range() and we won't block due to + * the non-sync write. + */ + atomic_dec_32(&zp->z_sync_writes_cnt); + if (error) return (error); crhold(cr); cookie = spl_fstrans_mark(); - error = -zfs_fsync(ITOZ(inode), datasync, cr); + error = -zfs_fsync(zp, datasync, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); @@ -413,6 +456,8 @@ zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, if (ret) return (ret); + kiocb->ki_pos = pos; + zfs_uio_t uio; zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, count, 0); @@ -647,24 +692,41 @@ zpl_readpage_filler(void *data, struct page *pp) * paging. For simplicity, the code relies on read_cache_pages() to * correctly lock each page for IO and call zpl_readpage(). */ +#ifdef HAVE_VFS_READPAGES static int zpl_readpages(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { return (read_cache_pages(mapping, pages, zpl_readpage_filler, NULL)); } +#else +static void +zpl_readahead(struct readahead_control *ractl) +{ + struct page *page; + + while ((page = readahead_page(ractl)) != NULL) { + int ret; + + ret = zpl_readpage_filler(NULL, page); + put_page(page); + if (ret) + break; + } +} +#endif static int zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) { - struct address_space *mapping = data; + boolean_t *for_sync = data; fstrans_cookie_t cookie; ASSERT(PageLocked(pp)); ASSERT(!PageWriteback(pp)); cookie = spl_fstrans_mark(); - (void) zfs_putpage(mapping->host, pp, wbc); + (void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync); spl_fstrans_unmark(cookie); return (0); @@ -691,8 +753,9 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) * we run it once in non-SYNC mode so that the ZIL gets all the data, * and then we commit it all in one go. */ + boolean_t for_sync = (sync_mode == WB_SYNC_ALL); wbc->sync_mode = WB_SYNC_NONE; - result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); + result = write_cache_pages(mapping, wbc, zpl_putpage, &for_sync); if (sync_mode != wbc->sync_mode) { ZPL_ENTER(zfsvfs); ZPL_VERIFY_ZP(zp); @@ -708,7 +771,8 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) * details). That being said, this is a no-op in most cases. 
*/ wbc->sync_mode = sync_mode; - result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); + result = write_cache_pages(mapping, wbc, zpl_putpage, + &for_sync); } return (result); } @@ -725,7 +789,9 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc) if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS) wbc->sync_mode = WB_SYNC_ALL; - return (zpl_putpage(pp, wbc, pp->mapping)); + boolean_t for_sync = (wbc->sync_mode == WB_SYNC_ALL); + + return (zpl_putpage(pp, wbc, &for_sync)); } /* @@ -764,11 +830,13 @@ zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len) if (mode & (test_mode)) { flock64_t bf; - if (offset > olen) - goto out_unmark; + if (mode & FALLOC_FL_KEEP_SIZE) { + if (offset > olen) + goto out_unmark; - if (offset + len > olen) - len = olen - offset; + if (offset + len > olen) + len = olen - offset; + } bf.l_type = F_WRLCK; bf.l_whence = SEEK_SET; bf.l_start = offset; @@ -1135,7 +1203,11 @@ zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) const struct address_space_operations zpl_address_space_operations = { +#ifdef HAVE_VFS_READPAGES .readpages = zpl_readpages, +#else + .readahead = zpl_readahead, +#endif .readpage = zpl_readpage, .writepage = zpl_writepage, .writepages = zpl_writepages, @@ -1143,6 +1215,9 @@ const struct address_space_operations zpl_address_space_operations = { #ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS .set_page_dirty = __set_page_dirty_nobuffers, #endif +#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO + .dirty_folio = filemap_dirty_folio, +#endif }; const struct file_operations zpl_file_operations = { diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index c964cce0de9a..4ebdf8331695 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -46,7 +46,10 @@ static unsigned int zvol_request_sync = 0; static unsigned int zvol_prefetch_bytes = (128 * 1024); static unsigned long zvol_max_discard_blocks = 16384; static unsigned int zvol_threads = 32; + +#ifndef HAVE_BLKDEV_GET_ERESTARTSYS static const unsigned int zvol_open_timeout_ms = 1000; +#endif struct zvol_state_os { struct gendisk *zvo_disk; /* generic disk */ @@ -903,22 +906,17 @@ zvol_alloc(dev_t dev, const char *name) zso->zvo_disk->major = zvol_major; zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; + /* + * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices. + * This is accomplished by limiting the number of minors for the + * device to one and explicitly disabling partition scanning. + */ if (volmode == ZFS_VOLMODE_DEV) { - /* - * ZFS_VOLMODE_DEV disable partitioning on ZVOL devices: set - * gendisk->minors = 1 as noted in include/linux/blkdev.h. - * Also disable extended partition numbers (GENHD_FL_EXT_DEVT) - * and suppresses partition scanning (GENHD_FL_NO_PART_SCAN) - * setting gendisk->flags accordingly. 
- */ zso->zvo_disk->minors = 1; -#if defined(GENHD_FL_EXT_DEVT) - zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT; -#endif -#if defined(GENHD_FL_NO_PART_SCAN) - zso->zvo_disk->flags |= GENHD_FL_NO_PART_SCAN; -#endif + zso->zvo_disk->flags &= ~ZFS_GENHD_FL_EXT_DEVT; + zso->zvo_disk->flags |= ZFS_GENHD_FL_NO_PART; } + zso->zvo_disk->first_minor = (dev & MINORMASK); zso->zvo_disk->fops = &zvol_ops; zso->zvo_disk->private_data = zv; diff --git a/sys/contrib/openzfs/module/spl/Makefile.in b/sys/contrib/openzfs/module/spl/Makefile.in deleted file mode 100644 index cedbfe92b58a..000000000000 --- a/sys/contrib/openzfs/module/spl/Makefile.in +++ /dev/null @@ -1,13 +0,0 @@ -ifneq ($(KBUILD_EXTMOD),) -src = @abs_srcdir@ -obj = @abs_builddir@ -mfdir = $(obj) -else -mfdir = $(srctree)/$(src) -endif - -MODULE := spl - -obj-$(CONFIG_ZFS) := $(MODULE).o - -include $(mfdir)/../os/linux/spl/Makefile diff --git a/sys/contrib/openzfs/module/unicode/Makefile.in b/sys/contrib/openzfs/module/unicode/Makefile.in deleted file mode 100644 index 59c07c4555b7..000000000000 --- a/sys/contrib/openzfs/module/unicode/Makefile.in +++ /dev/null @@ -1,11 +0,0 @@ -ifneq ($(KBUILD_EXTMOD),) -src = @abs_srcdir@ -obj = @abs_builddir@ -endif - -MODULE := zunicode - -obj-$(CONFIG_ZFS) := $(MODULE).o - -$(MODULE)-objs += u8_textprep.o -$(MODULE)-objs += uconv.o diff --git a/sys/contrib/openzfs/module/unicode/u8_textprep.c b/sys/contrib/openzfs/module/unicode/u8_textprep.c index b6b07b2453af..37d648b2172d 100644 --- a/sys/contrib/openzfs/module/unicode/u8_textprep.c +++ b/sys/contrib/openzfs/module/unicode/u8_textprep.c @@ -2129,27 +2129,6 @@ u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen, return (ret_val); } -#if defined(_KERNEL) -static int __init -unicode_init(void) -{ - return (0); -} - -static void __exit -unicode_fini(void) -{ -} - -module_init(unicode_init); -module_exit(unicode_fini); -#endif - -ZFS_MODULE_DESCRIPTION("Unicode implementation"); -ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); -ZFS_MODULE_LICENSE(ZFS_META_LICENSE); -ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); - EXPORT_SYMBOL(u8_validate); EXPORT_SYMBOL(u8_strcmp); EXPORT_SYMBOL(u8_textprep_str); diff --git a/sys/contrib/openzfs/module/zcommon/Makefile.in b/sys/contrib/openzfs/module/zcommon/Makefile.in deleted file mode 100644 index ebc538440445..000000000000 --- a/sys/contrib/openzfs/module/zcommon/Makefile.in +++ /dev/null @@ -1,28 +0,0 @@ -ifneq ($(KBUILD_EXTMOD),) -src = @abs_srcdir@ -obj = @abs_builddir@ -endif - -MODULE := zcommon - -obj-$(CONFIG_ZFS) := $(MODULE).o - -# Suppress unused-value warnings in sparc64 architecture headers -ccflags-$(CONFIG_SPARC64) += -Wno-unused-value - -$(MODULE)-objs += cityhash.o -$(MODULE)-objs += zfeature_common.o -$(MODULE)-objs += zfs_comutil.o -$(MODULE)-objs += zfs_deleg.o -$(MODULE)-objs += zfs_fletcher.o -$(MODULE)-objs += zfs_fletcher_superscalar.o -$(MODULE)-objs += zfs_fletcher_superscalar4.o -$(MODULE)-objs += zfs_namecheck.o -$(MODULE)-objs += zfs_prop.o -$(MODULE)-objs += zpool_prop.o -$(MODULE)-objs += zprop_common.o - -$(MODULE)-$(CONFIG_X86) += zfs_fletcher_intel.o -$(MODULE)-$(CONFIG_X86) += zfs_fletcher_sse.o -$(MODULE)-$(CONFIG_X86) += zfs_fletcher_avx512.o -$(MODULE)-$(CONFIG_ARM64) += zfs_fletcher_aarch64_neon.o diff --git a/sys/contrib/openzfs/module/zcommon/zfeature_common.c b/sys/contrib/openzfs/module/zcommon/zfeature_common.c index 13dbccae2d4a..f09389e6d02e 100644 --- a/sys/contrib/openzfs/module/zcommon/zfeature_common.c +++ 
b/sys/contrib/openzfs/module/zcommon/zfeature_common.c @@ -696,6 +696,7 @@ zpool_feature_init(void) ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { + static const spa_feature_t zilsaxattr_deps[] = { SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURE_NONE @@ -707,6 +708,12 @@ zpool_feature_init(void) ZFEATURE_TYPE_BOOLEAN, zilsaxattr_deps, sfeatures); } + zfeature_register(SPA_FEATURE_HEAD_ERRLOG, + "com.delphix:head_errlog", "head_errlog", + "Support for per-dataset on-disk error logs.", + ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL, + sfeatures); + zfs_mod_list_supported_free(sfeatures); } diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c index 8b3e774d99ec..500d80a33b6b 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_prop.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c @@ -1006,7 +1006,10 @@ uint8_t **zfs_kfpu_fpregs; EXPORT_SYMBOL(zfs_kfpu_fpregs); #endif /* defined(HAVE_KERNEL_FPU_INTERNAL) */ -static int __init +extern int __init zcommon_init(void); +extern void zcommon_fini(void); + +int __init zcommon_init(void) { int error = kfpu_init(); @@ -1018,22 +1021,19 @@ zcommon_init(void) return (0); } -static void __exit +void zcommon_fini(void) { fletcher_4_fini(); kfpu_fini(); } +#ifdef __FreeBSD__ module_init_early(zcommon_init); module_exit(zcommon_fini); - #endif -ZFS_MODULE_DESCRIPTION("Generic ZFS support"); -ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); -ZFS_MODULE_LICENSE(ZFS_META_LICENSE); -ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); +#endif /* zfs dataset property functions */ EXPORT_SYMBOL(zfs_userquota_prop_prefixes); diff --git a/sys/contrib/openzfs/module/zcommon/zprop_common.c b/sys/contrib/openzfs/module/zcommon/zprop_common.c index 0f496877577b..b1da4ca64bd5 100644 --- a/sys/contrib/openzfs/module/zcommon/zprop_common.c +++ b/sys/contrib/openzfs/module/zcommon/zprop_common.c @@ -136,7 +136,7 @@ zprop_register_string(int prop, const char *name, const char *def, const char *colname, const struct zfs_mod_supported_features *sfeatures) { zprop_register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr, - objset_types, values, colname, B_FALSE, B_TRUE, B_FALSE, NULL, + objset_types, values, colname, B_FALSE, B_TRUE, B_TRUE, NULL, sfeatures); } @@ -159,7 +159,7 @@ zprop_register_index(int prop, const char *name, uint64_t def, const struct zfs_mod_supported_features *sfeatures) { zprop_register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr, - objset_types, values, colname, B_FALSE, B_TRUE, B_FALSE, idx_tbl, + objset_types, values, colname, B_FALSE, B_TRUE, B_TRUE, idx_tbl, sfeatures); } diff --git a/sys/contrib/openzfs/module/zfs/Makefile.in b/sys/contrib/openzfs/module/zfs/Makefile.in deleted file mode 100644 index 30dc91a7eb59..000000000000 --- a/sys/contrib/openzfs/module/zfs/Makefile.in +++ /dev/null @@ -1,158 +0,0 @@ -ifneq ($(KBUILD_EXTMOD),) -src = @abs_srcdir@ -obj = @abs_builddir@ -mfdir = $(obj) -else -mfdir = $(srctree)/$(src) -endif - -MODULE := zfs - -obj-$(CONFIG_ZFS) := $(MODULE).o - -# Suppress unused-value warnings in sparc64 architecture headers -ccflags-$(CONFIG_SPARC64) += -Wno-unused-value - -$(MODULE)-objs += abd.o -$(MODULE)-objs += aggsum.o -$(MODULE)-objs += arc.o -$(MODULE)-objs += blkptr.o -$(MODULE)-objs += bplist.o -$(MODULE)-objs += bpobj.o -$(MODULE)-objs += bptree.o -$(MODULE)-objs += btree.o -$(MODULE)-objs += bqueue.o -$(MODULE)-objs += dataset_kstats.o -$(MODULE)-objs += dbuf.o -$(MODULE)-objs += dbuf_stats.o -$(MODULE)-objs += ddt.o 
-$(MODULE)-objs += ddt_zap.o -$(MODULE)-objs += dmu.o -$(MODULE)-objs += dmu_diff.o -$(MODULE)-objs += dmu_object.o -$(MODULE)-objs += dmu_objset.o -$(MODULE)-objs += dmu_recv.o -$(MODULE)-objs += dmu_redact.o -$(MODULE)-objs += dmu_send.o -$(MODULE)-objs += dmu_traverse.o -$(MODULE)-objs += dmu_tx.o -$(MODULE)-objs += dmu_zfetch.o -$(MODULE)-objs += dnode.o -$(MODULE)-objs += dnode_sync.o -$(MODULE)-objs += dsl_bookmark.o -$(MODULE)-objs += dsl_crypt.o -$(MODULE)-objs += dsl_dataset.o -$(MODULE)-objs += dsl_deadlist.o -$(MODULE)-objs += dsl_deleg.o -$(MODULE)-objs += dsl_destroy.o -$(MODULE)-objs += dsl_dir.o -$(MODULE)-objs += dsl_pool.o -$(MODULE)-objs += dsl_prop.o -$(MODULE)-objs += dsl_scan.o -$(MODULE)-objs += dsl_synctask.o -$(MODULE)-objs += dsl_userhold.o -$(MODULE)-objs += edonr_zfs.o -$(MODULE)-objs += fm.o -$(MODULE)-objs += gzip.o -$(MODULE)-objs += hkdf.o -$(MODULE)-objs += lz4.o -$(MODULE)-objs += lz4_zfs.o -$(MODULE)-objs += lzjb.o -$(MODULE)-objs += metaslab.o -$(MODULE)-objs += mmp.o -$(MODULE)-objs += multilist.o -$(MODULE)-objs += objlist.o -$(MODULE)-objs += pathname.o -$(MODULE)-objs += range_tree.o -$(MODULE)-objs += refcount.o -$(MODULE)-objs += rrwlock.o -$(MODULE)-objs += sa.o -$(MODULE)-objs += sha256.o -$(MODULE)-objs += skein_zfs.o -$(MODULE)-objs += spa.o -$(MODULE)-objs += spa_boot.o -$(MODULE)-objs += spa_checkpoint.o -$(MODULE)-objs += spa_config.o -$(MODULE)-objs += spa_errlog.o -$(MODULE)-objs += spa_history.o -$(MODULE)-objs += spa_log_spacemap.o -$(MODULE)-objs += spa_misc.o -$(MODULE)-objs += spa_stats.o -$(MODULE)-objs += space_map.o -$(MODULE)-objs += space_reftree.o -$(MODULE)-objs += txg.o -$(MODULE)-objs += uberblock.o -$(MODULE)-objs += unique.o -$(MODULE)-objs += vdev.o -$(MODULE)-objs += vdev_cache.o -$(MODULE)-objs += vdev_draid.o -$(MODULE)-objs += vdev_draid_rand.o -$(MODULE)-objs += vdev_indirect.o -$(MODULE)-objs += vdev_indirect_births.o -$(MODULE)-objs += vdev_indirect_mapping.o -$(MODULE)-objs += vdev_initialize.o -$(MODULE)-objs += vdev_label.o -$(MODULE)-objs += vdev_mirror.o -$(MODULE)-objs += vdev_missing.o -$(MODULE)-objs += vdev_queue.o -$(MODULE)-objs += vdev_raidz.o -$(MODULE)-objs += vdev_raidz_math.o -$(MODULE)-objs += vdev_raidz_math_scalar.o -$(MODULE)-objs += vdev_rebuild.o -$(MODULE)-objs += vdev_removal.o -$(MODULE)-objs += vdev_root.o -$(MODULE)-objs += vdev_trim.o -$(MODULE)-objs += zap.o -$(MODULE)-objs += zap_leaf.o -$(MODULE)-objs += zap_micro.o -$(MODULE)-objs += zcp.o -$(MODULE)-objs += zcp_get.o -$(MODULE)-objs += zcp_global.o -$(MODULE)-objs += zcp_iter.o -$(MODULE)-objs += zcp_set.o -$(MODULE)-objs += zcp_synctask.o -$(MODULE)-objs += zfeature.o -$(MODULE)-objs += zfs_byteswap.o -$(MODULE)-objs += zfs_fm.o -$(MODULE)-objs += zfs_fuid.o -$(MODULE)-objs += zfs_ioctl.o -$(MODULE)-objs += zfs_log.o -$(MODULE)-objs += zfs_onexit.o -$(MODULE)-objs += zfs_quota.o -$(MODULE)-objs += zfs_ratelimit.o -$(MODULE)-objs += zfs_replay.o -$(MODULE)-objs += zfs_rlock.o -$(MODULE)-objs += zfs_sa.o -$(MODULE)-objs += zfs_vnops.o -$(MODULE)-objs += zil.o -$(MODULE)-objs += zio.o -$(MODULE)-objs += zio_checksum.o -$(MODULE)-objs += zio_compress.o -$(MODULE)-objs += zio_inject.o -$(MODULE)-objs += zle.o -$(MODULE)-objs += zrlock.o -$(MODULE)-objs += zthr.o -$(MODULE)-objs += zvol.o - -# Suppress incorrect warnings from versions of objtool which are not -# aware of x86 EVEX prefix instructions used for AVX512. 
-OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512bw.o := y -OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512f.o := y - -$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_sse2.o -$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_ssse3.o -$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx2.o -$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512f.o -$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512bw.o - -$(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neon.o -$(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neonx2.o - -$(MODULE)-$(CONFIG_PPC) += vdev_raidz_math_powerpc_altivec.o -$(MODULE)-$(CONFIG_PPC64) += vdev_raidz_math_powerpc_altivec.o - -ifeq ($(CONFIG_ALTIVEC),y) -$(obj)/vdev_raidz_math_powerpc_altivec.o: c_flags += -maltivec -endif - -include $(mfdir)/../os/linux/zfs/Makefile diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index 79e754c4abcb..af42670cc2c9 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -9337,26 +9337,37 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, } if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { - cabd = abd_alloc_for_io(asize, ismd); - tmp = abd_borrow_buf(cabd, asize); + /* + * In some cases, we can wind up with size > asize, so + * we need to opt for the larger allocation option here. + * + * (We also need abd_return_buf_copy in all cases because + * it's an ASSERT() to modify the buffer before returning it + * with arc_return_buf(), and all the compressors + * write things before deciding to fail compression in nearly + * every case.) + */ + cabd = abd_alloc_for_io(size, ismd); + tmp = abd_borrow_buf(cabd, size); psize = zio_compress_data(compress, to_write, tmp, size, hdr->b_complevel); - if (psize >= size) { - abd_return_buf(cabd, tmp, asize); + if (psize >= asize) { + psize = HDR_GET_PSIZE(hdr); + abd_return_buf_copy(cabd, tmp, size); HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); to_write = cabd; - abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); - if (size != asize) - abd_zero_off(to_write, size, asize - size); + abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize); + if (psize != asize) + abd_zero_off(to_write, psize, asize - psize); goto encrypt; } ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); if (psize < asize) memset((char *)tmp + psize, 0, asize - psize); psize = HDR_GET_PSIZE(hdr); - abd_return_buf_copy(cabd, tmp, asize); + abd_return_buf_copy(cabd, tmp, size); to_write = cabd; } @@ -11045,20 +11056,20 @@ EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min, - param_get_long, ZMOD_RW, "Min arc size"); + param_get_long, ZMOD_RW, "Minimum ARC size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max, - param_get_long, ZMOD_RW, "Max arc size"); + param_get_long, ZMOD_RW, "Maximum ARC size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_long, - param_get_long, ZMOD_RW, "Metadata limit for arc size"); + param_get_long, ZMOD_RW, "Metadata limit for ARC size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit_percent, param_set_arc_long, param_get_long, ZMOD_RW, - "Percent of arc size for arc meta limit"); + "Percent of ARC size for ARC meta limit"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_long, - param_get_long, ZMOD_RW, "Min arc metadata"); + param_get_long, ZMOD_RW, "Minimum ARC metadata size in bytes"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_prune, INT, ZMOD_RW, 
"Meta objects to scan for prune"); @@ -11070,16 +11081,16 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_strategy, INT, ZMOD_RW, "Meta reclaim strategy"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int, - param_get_int, ZMOD_RW, "Seconds before growing arc size"); + param_get_int, ZMOD_RW, "Seconds before growing ARC size"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, p_dampener_disable, INT, ZMOD_RW, "Disable arc_p adapt dampener"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int, - param_get_int, ZMOD_RW, "log2(fraction of arc to reclaim)"); + param_get_int, ZMOD_RW, "log2(fraction of ARC to reclaim)"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW, - "Percent of pagecache to reclaim arc to"); + "Percent of pagecache to reclaim ARC to"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, p_min_shift, param_set_arc_int, param_get_int, ZMOD_RW, "arc_c shift to calc min/max arc_p"); @@ -11088,7 +11099,7 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, INT, ZMOD_RD, "Target average block size"); ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW, - "Disable compressed arc buffers"); + "Disable compressed ARC buffers"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int, param_get_int, ZMOD_RW, "Min life of prefetch block in ms"); @@ -11149,7 +11160,7 @@ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_long, param_get_long, ZMOD_RW, "System free memory target size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_long, - param_get_long, ZMOD_RW, "Minimum bytes of dnodes in arc"); + param_get_long, ZMOD_RW, "Minimum bytes of dnodes in ARC"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent, param_set_arc_long, param_get_long, ZMOD_RW, diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index ee2470b38606..9a273b010fb1 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -339,18 +339,18 @@ dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; - mutex_enter(DBUF_HASH_MUTEX(h, idx)); + rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_READER); for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { if (DBUF_EQUAL(db, os, obj, level, blkid)) { mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING) { - mutex_exit(DBUF_HASH_MUTEX(h, idx)); + rw_exit(DBUF_HASH_RWLOCK(h, idx)); return (db); } mutex_exit(&db->db_mtx); } } - mutex_exit(DBUF_HASH_MUTEX(h, idx)); + rw_exit(DBUF_HASH_RWLOCK(h, idx)); return (NULL); } @@ -393,13 +393,13 @@ dbuf_hash_insert(dmu_buf_impl_t *db) hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; - mutex_enter(DBUF_HASH_MUTEX(h, idx)); + rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_WRITER); for (dbf = h->hash_table[idx], i = 0; dbf != NULL; dbf = dbf->db_hash_next, i++) { if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { mutex_enter(&dbf->db_mtx); if (dbf->db_state != DB_EVICTING) { - mutex_exit(DBUF_HASH_MUTEX(h, idx)); + rw_exit(DBUF_HASH_RWLOCK(h, idx)); return (dbf); } mutex_exit(&dbf->db_mtx); @@ -417,7 +417,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db) mutex_enter(&db->db_mtx); db->db_hash_next = h->hash_table[idx]; h->hash_table[idx] = db; - mutex_exit(DBUF_HASH_MUTEX(h, idx)); + rw_exit(DBUF_HASH_RWLOCK(h, idx)); uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64); DBUF_STAT_MAX(hash_elements_max, he); @@ -474,13 +474,13 @@ 
dbuf_hash_remove(dmu_buf_impl_t *db) /* * We mustn't hold db_mtx to maintain lock ordering: - * DBUF_HASH_MUTEX > db_mtx. + * DBUF_HASH_RWLOCK > db_mtx. */ ASSERT(zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_state == DB_EVICTING); ASSERT(!MUTEX_HELD(&db->db_mtx)); - mutex_enter(DBUF_HASH_MUTEX(h, idx)); + rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_WRITER); dbp = &h->hash_table[idx]; while ((dbf = *dbp) != db) { dbp = &dbf->db_hash_next; @@ -491,7 +491,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db) if (h->hash_table[idx] && h->hash_table[idx]->db_hash_next == NULL) DBUF_STAT_BUMPDOWN(hash_chains); - mutex_exit(DBUF_HASH_MUTEX(h, idx)); + rw_exit(DBUF_HASH_RWLOCK(h, idx)); atomic_dec_64(&dbuf_stats.hash_elements.value.ui64); } @@ -914,8 +914,8 @@ retry: sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); - for (i = 0; i < DBUF_MUTEXES; i++) - mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); + for (i = 0; i < DBUF_RWLOCKS; i++) + rw_init(&h->hash_rwlocks[i], NULL, RW_DEFAULT, NULL); dbuf_stats_init(h); @@ -981,8 +981,8 @@ dbuf_fini(void) dbuf_stats_destroy(); - for (i = 0; i < DBUF_MUTEXES; i++) - mutex_destroy(&h->hash_mutexes[i]); + for (i = 0; i < DBUF_RWLOCKS; i++) + rw_destroy(&h->hash_rwlocks[i]); #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages @@ -3947,7 +3947,7 @@ dmu_buf_get_user(dmu_buf_t *db_fake) } void -dmu_buf_user_evict_wait() +dmu_buf_user_evict_wait(void) { taskq_wait(dbu_evict_taskq); } diff --git a/sys/contrib/openzfs/module/zfs/dbuf_stats.c b/sys/contrib/openzfs/module/zfs/dbuf_stats.c index fa9a5f08060a..a42750ac8e90 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf_stats.c +++ b/sys/contrib/openzfs/module/zfs/dbuf_stats.c @@ -137,7 +137,7 @@ dbuf_stats_hash_table_data(char *buf, size_t size, void *data) if (size) buf[0] = 0; - mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); + rw_enter(DBUF_HASH_RWLOCK(h, dsh->idx), RW_READER); for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) { /* * Returning ENOMEM will cause the data and header functions @@ -158,7 +158,7 @@ dbuf_stats_hash_table_data(char *buf, size_t size, void *data) mutex_exit(&db->db_mtx); } - mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); + rw_exit(DBUF_HASH_RWLOCK(h, dsh->idx)); return (error); } diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index 461feeffb6a3..7d8b2c96bd74 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -86,7 +86,7 @@ static int zfs_dmu_offset_next_sync = 1; * helps to limit the amount of memory that can be used by prefetching. * Larger objects should be prefetched a bit at a time. */ -static int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; +int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c index e836d681e920..ca894c35253c 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c @@ -73,12 +73,19 @@ * The SPA supports block sizes up to 16MB. However, very large blocks * can have an impact on i/o latency (e.g. tying up a spinning disk for * ~300ms), and also potentially on the memory allocator. Therefore, - * we do not allow the recordsize to be set larger than zfs_max_recordsize - * (default 1MB). 
Larger blocks can be created by changing this tunable, - * and pools with larger blocks can always be imported and used, regardless - * of this setting. + * we did not allow the recordsize to be set larger than zfs_max_recordsize + * (former default: 1MB). Larger blocks could be created by changing this + * tunable, and pools with larger blocks could always be imported and used, + * regardless of this setting. + * + * We do, however, still limit it by default to 1M on x86_32, because Linux's + * 3/1 memory split doesn't leave much room for 16M chunks. */ -int zfs_max_recordsize = 1 * 1024 * 1024; +#ifdef _ILP32 +int zfs_max_recordsize = 1 * 1024 * 1024; +#else +int zfs_max_recordsize = 16 * 1024 * 1024; +#endif static int zfs_allow_redacted_dataset_mount = 0; #define SWITCH64(x, y) \ @@ -3708,6 +3715,15 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) dsl_dir_rele(odd, FTAG); promote_rele(ddpa, FTAG); + + /* + * Transfer common error blocks from old head to new head. + */ + if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG)) { + uint64_t old_head = origin_head->ds_object; + uint64_t new_head = hds->ds_object; + spa_swap_errlog(dp->dp_spa, new_head, old_head, tx); + } } /* @@ -4924,13 +4940,38 @@ dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps, ds->ds_feature[SPA_FEATURE_REDACTED_DATASETS] = ftuaa; } -#if defined(_LP64) -#define RECORDSIZE_PERM ZMOD_RW -#else -/* Limited to 1M on 32-bit platforms due to lack of virtual address space */ -#define RECORDSIZE_PERM ZMOD_RD -#endif -ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, INT, RECORDSIZE_PERM, +/* + * Find and return (in *oldest_dsobj) the oldest snapshot of the dsobj + * dataset whose birth time is >= min_txg. + */ +int +dsl_dataset_oldest_snapshot(spa_t *spa, uint64_t head_ds, uint64_t min_txg, + uint64_t *oldest_dsobj) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + + int error = dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds); + if (error != 0) + return (error); + + uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + + while (prev_obj != 0 && min_txg < prev_obj_txg) { + dsl_dataset_rele(ds, FTAG); + if ((error = dsl_dataset_hold_obj(dp, prev_obj, + FTAG, &ds)) != 0) + return (error); + prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + } + *oldest_dsobj = ds->ds_object; + dsl_dataset_rele(ds, FTAG); + return (0); +} + +ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, INT, ZMOD_RW, "Max allowed record size"); ZFS_MODULE_PARAM(zfs, zfs_, allow_redacted_dataset_mount, INT, ZMOD_RW, diff --git a/sys/contrib/openzfs/module/zfs/dsl_destroy.c b/sys/contrib/openzfs/module/zfs/dsl_destroy.c index b32929b3320c..7dddd8eed5e9 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_destroy.c +++ b/sys/contrib/openzfs/module/zfs/dsl_destroy.c @@ -1153,6 +1153,9 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx); dsl_dataset_rele(prev, FTAG); } + /* Delete errlog. */ + if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_HEAD_ERRLOG)) + spa_delete_dataset_errlog(dp->dp_spa, ds->ds_object, tx); } void diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c index 7ed83b305db7..ab32bfec1310 100644 --- a/sys/contrib/openzfs/module/zfs/metaslab.c +++ b/sys/contrib/openzfs/module/zfs/metaslab.c @@ -48,10 +48,10 @@ /* * Metaslab granularity, in bytes. 
This is roughly similar to what would be * referred to as the "stripe size" in traditional RAID arrays. In normal - * operation, we will try to write this amount of data to a top-level vdev - * before moving on to the next one. + * operation, we will try to write this amount of data to each disk before + * moving on to the next top-level vdev. */ -static unsigned long metaslab_aliquot = 512 << 10; +static unsigned long metaslab_aliquot = 1024 * 1024; /* * For testing, make some blocks above a certain size be gang blocks. @@ -899,7 +899,8 @@ metaslab_group_activate(metaslab_group_t *mg) if (++mg->mg_activation_count <= 0) return; - mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); + mg->mg_aliquot = metaslab_aliquot * MAX(1, + vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd)); metaslab_group_alloc_update(mg); if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) { @@ -2750,7 +2751,8 @@ metaslab_fini_flush_data(metaslab_t *msp) mutex_exit(&spa->spa_flushed_ms_lock); spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); - spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp)); + spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp), + metaslab_unflushed_dirty(msp)); } uint64_t @@ -3728,50 +3730,45 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) metaslab_flush_update(msp, tx); } -/* - * Called when the metaslab has been flushed (its own spacemap now reflects - * all the contents of the pool-wide spacemap log). Updates the metaslab's - * metadata and any pool-wide related log space map data (e.g. summary, - * obsolete logs, etc..) to reflect that. - */ static void -metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) +metaslab_unflushed_add(metaslab_t *msp, dmu_tx_t *tx) { - metaslab_group_t *mg = msp->ms_group; - spa_t *spa = mg->mg_vd->vdev_spa; - - ASSERT(MUTEX_HELD(&msp->ms_lock)); - - ASSERT3U(spa_sync_pass(spa), ==, 1); + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + ASSERT(spa_syncing_log_sm(spa) != NULL); + ASSERT(msp->ms_sm != NULL); ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); - /* - * Just because a metaslab got flushed, that doesn't mean that - * it will pass through metaslab_sync_done(). Thus, make sure to - * update ms_synced_length here in case it doesn't. - */ - msp->ms_synced_length = space_map_length(msp->ms_sm); + mutex_enter(&spa->spa_flushed_ms_lock); + metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + metaslab_set_unflushed_dirty(msp, B_TRUE); + avl_add(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); - /* - * We may end up here from metaslab_condense() without the - * feature being active. In that case this is a no-op. 
- */ - if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) - return; + spa_log_sm_increment_current_mscount(spa); + spa_log_summary_add_flushed_metaslab(spa, B_TRUE); +} +void +metaslab_unflushed_bump(metaslab_t *msp, dmu_tx_t *tx, boolean_t dirty) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; ASSERT(spa_syncing_log_sm(spa) != NULL); ASSERT(msp->ms_sm != NULL); ASSERT(metaslab_unflushed_txg(msp) != 0); ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); /* update metaslab's position in our flushing tree */ uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp); + boolean_t ms_prev_flushed_dirty = metaslab_unflushed_dirty(msp); mutex_enter(&spa->spa_flushed_ms_lock); avl_remove(&spa->spa_metaslabs_by_flushed, msp); metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + metaslab_set_unflushed_dirty(msp, dirty); avl_add(&spa->spa_metaslabs_by_flushed, msp); mutex_exit(&spa->spa_flushed_ms_lock); @@ -3779,17 +3776,47 @@ metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg); spa_log_sm_increment_current_mscount(spa); + /* update log space map summary */ + spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg, + ms_prev_flushed_dirty); + spa_log_summary_add_flushed_metaslab(spa, dirty); + /* cleanup obsolete logs if any */ - uint64_t log_blocks_before = spa_log_sm_nblocks(spa); spa_cleanup_old_sm_logs(spa, tx); - uint64_t log_blocks_after = spa_log_sm_nblocks(spa); - VERIFY3U(log_blocks_after, <=, log_blocks_before); +} - /* update log space map summary */ - uint64_t blocks_gone = log_blocks_before - log_blocks_after; - spa_log_summary_add_flushed_metaslab(spa); - spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg); - spa_log_summary_decrement_blkcount(spa, blocks_gone); +/* + * Called when the metaslab has been flushed (its own spacemap now reflects + * all the contents of the pool-wide spacemap log). Updates the metaslab's + * metadata and any pool-wide related log space map data (e.g. summary, + * obsolete logs, etc..) to reflect that. + */ +static void +metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx) +{ + metaslab_group_t *mg = msp->ms_group; + spa_t *spa = mg->mg_vd->vdev_spa; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + ASSERT3U(spa_sync_pass(spa), ==, 1); + + /* + * Just because a metaslab got flushed, that doesn't mean that + * it will pass through metaslab_sync_done(). Thus, make sure to + * update ms_synced_length here in case it doesn't. + */ + msp->ms_synced_length = space_map_length(msp->ms_sm); + + /* + * We may end up here from metaslab_condense() without the + * feature being active. In that case this is a no-op. 
+ */ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) || + metaslab_unflushed_txg(msp) == 0) + return; + + metaslab_unflushed_bump(msp, tx, B_FALSE); } boolean_t @@ -4005,23 +4032,6 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) ASSERT0(metaslab_allocated_space(msp)); } - if (metaslab_unflushed_txg(msp) == 0 && - spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { - ASSERT(spa_syncing_log_sm(spa) != NULL); - - metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); - spa_log_sm_increment_current_mscount(spa); - spa_log_summary_add_flushed_metaslab(spa); - - ASSERT(msp->ms_sm != NULL); - mutex_enter(&spa->spa_flushed_ms_lock); - avl_add(&spa->spa_metaslabs_by_flushed, msp); - mutex_exit(&spa->spa_flushed_ms_lock); - - ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); - ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); - } - if (!range_tree_is_empty(msp->ms_checkpointing) && vd->vdev_checkpoint_sm == NULL) { ASSERT(spa_has_checkpoint(spa)); @@ -4069,6 +4079,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) space_map_t *log_sm = spa_syncing_log_sm(spa); if (log_sm != NULL) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); + if (metaslab_unflushed_txg(msp) == 0) + metaslab_unflushed_add(msp, tx); + else if (!metaslab_unflushed_dirty(msp)) + metaslab_unflushed_bump(msp, tx, B_TRUE); space_map_write(log_sm, alloctree, SM_ALLOC, vd->vdev_id, tx); @@ -6131,6 +6145,12 @@ metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload) mutex_exit(&mg->mg_ms_disabled_lock); } +void +metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty) +{ + ms->ms_unflushed_dirty = dirty; +} + static void metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) { @@ -6167,15 +6187,16 @@ metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) void metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) { - spa_t *spa = ms->ms_group->mg_vd->vdev_spa; - - if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) - return; - ms->ms_unflushed_txg = txg; metaslab_update_ondisk_flush_data(ms, tx); } +boolean_t +metaslab_unflushed_dirty(metaslab_t *ms) +{ + return (ms->ms_unflushed_dirty); +} + uint64_t metaslab_unflushed_txg(metaslab_t *ms) { diff --git a/sys/contrib/openzfs/module/zfs/sa.c b/sys/contrib/openzfs/module/zfs/sa.c index 2b6776581a47..db8c2b831f1d 100644 --- a/sys/contrib/openzfs/module/zfs/sa.c +++ b/sys/contrib/openzfs/module/zfs/sa.c @@ -1068,8 +1068,8 @@ sa_setup(objset_t *os, uint64_t sa_obj, const sa_attr_reg_t *reg_attrs, za.za_num_integers); break; } - VERIFY(ddi_strtoull(za.za_name, NULL, 10, - (unsigned long long *)&lot_num) == 0); + VERIFY0(ddi_strtoull(za.za_name, NULL, 10, + (unsigned long long *)&lot_num)); (void) sa_add_layout_entry(os, lot_attrs, za.za_num_integers, lot_num, diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index e69cb5527be8..01114dedef48 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -4355,7 +4355,7 @@ spa_ld_load_vdev_metadata(spa_t *spa) error = spa_ld_log_spacemaps(spa); if (error != 0) { - spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]", + spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]", error); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } diff --git a/sys/contrib/openzfs/module/zfs/spa_errlog.c b/sys/contrib/openzfs/module/zfs/spa_errlog.c index c6b28ea7d1b8..9e5d1de63c0b 100644 --- a/sys/contrib/openzfs/module/zfs/spa_errlog.c +++ 
b/sys/contrib/openzfs/module/zfs/spa_errlog.c @@ -20,7 +20,8 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2014, Delphix. All rights reserved. + * Copyright (c) 2021, George Amanakis. All rights reserved. */ /* @@ -43,6 +44,16 @@ * calculation when the data is requested, storing the result so future queries * will be faster. * + * If the head_errlog feature is enabled, a different on-disk format is used. + * The error log of each head dataset is stored separately in the zap object + * and keyed by the head id. This enables listing every dataset affected in + * userland. In order to be able to track whether an error block has been + * modified or added to snapshots since it was marked as an error, a new tuple + * is introduced: zbookmark_err_phys_t. It allows the storage of the birth + * transaction group of an error block on-disk. The birth transaction group is + * used by check_filesystem() to assess whether this block was freed, + * re-written or added to a snapshot since its marking as an error. + * * This log is then shipped into an nvlist where the key is the dataset name and * the value is the object name. Userland is then responsible for uniquifying * this list and displaying it to the user. @@ -53,7 +64,17 @@ #include <sys/spa_impl.h> #include <sys/zap.h> #include <sys/zio.h> +#include <sys/dsl_dir.h> +#include <sys/dmu_objset.h> +#include <sys/dbuf.h> +/* + * spa_upgrade_errlog_limit : A zfs module parameter that controls the number + * of on-disk error log entries that will be converted to the new + * format when enabling head_errlog. Defaults to 0 which converts + * all log entries. + */ +static uint32_t spa_upgrade_errlog_limit = 0; /* * Convert a bookmark to a string. @@ -67,9 +88,35 @@ bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len) } /* - * Convert a string to a bookmark + * Convert an err_phys to a string. + */ +static void +errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len) +{ + (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", + (u_longlong_t)zep->zb_object, (u_longlong_t)zep->zb_level, + (u_longlong_t)zep->zb_blkid, (u_longlong_t)zep->zb_birth); +} + +/* + * Convert a string to a err_phys. + */ +static void +name_to_errphys(char *buf, zbookmark_err_phys_t *zep) +{ + zep->zb_object = zfs_strtonum(buf, &buf); + ASSERT(*buf == ':'); + zep->zb_level = (int)zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zep->zb_blkid = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zep->zb_birth = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == '\0'); +} + +/* + * Convert a string to a bookmark. 
*/ -#ifdef _KERNEL static void name_to_bookmark(char *buf, zbookmark_phys_t *zb) { @@ -82,8 +129,74 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb) zb->zb_blkid = zfs_strtonum(buf + 1, &buf); ASSERT(*buf == '\0'); } + +#ifdef _KERNEL +static void +zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb) +{ + zb->zb_objset = dataset; + zb->zb_object = zep->zb_object; + zb->zb_level = zep->zb_level; + zb->zb_blkid = zep->zb_blkid; +} #endif +static void +name_to_object(char *buf, uint64_t *obj) +{ + *obj = zfs_strtonum(buf, &buf); + ASSERT(*buf == '\0'); +} + +static int +get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj, + uint64_t *head_dataset_id) +{ + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_dataset_t *ds; + objset_t *os; + + dsl_pool_config_enter(dp, FTAG); + int error = dsl_dataset_hold_obj(dp, ds_obj, FTAG, &ds); + if (error != 0) { + dsl_pool_config_exit(dp, FTAG); + return (error); + } + ASSERT(head_dataset_id); + *head_dataset_id = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; + + error = dmu_objset_from_ds(ds, &os); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + return (error); + } + + dnode_t *dn; + blkptr_t bp; + + error = dnode_hold(os, zep->zb_object, FTAG, &dn); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + return (error); + } + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL, + NULL); + + if (error == 0 && BP_IS_HOLE(&bp)) + error = SET_ERROR(ENOENT); + + zep->zb_birth = bp.blk_birth; + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + return (error); +} + /* * Log an uncorrectable error to the persistent error log. We add it to the * spa's list of pending errors. The changes are actually synced out to disk @@ -128,6 +241,276 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb) mutex_exit(&spa->spa_errlist_lock); } +#ifdef _KERNEL +static int +find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep, + uint64_t *birth_txg) +{ + objset_t *os; + int error = dmu_objset_from_ds(ds, &os); + if (error != 0) + return (error); + + dnode_t *dn; + blkptr_t bp; + + error = dnode_hold(os, zep->zb_object, FTAG, &dn); + if (error != 0) + return (error); + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL, + NULL); + + if (error == 0 && BP_IS_HOLE(&bp)) + error = SET_ERROR(ENOENT); + + *birth_txg = bp.blk_birth; + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + return (error); +} + +/* + * This function serves a double role. If only_count is true, it returns + * (in *count) how many times an error block belonging to this filesystem is + * referenced by snapshots or clones. If only_count is false, each time the + * error block is referenced by a snapshot or clone, it fills the userspace + * array at uaddr with the bookmarks of the error blocks. The array is filled + * from the back and *count is modified to be the number of unused entries at + * the beginning of the array. 
+ */ +static int +check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, + uint64_t *count, void *uaddr, boolean_t only_count) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + + int error = dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds); + if (error != 0) + return (error); + + uint64_t latest_txg; + uint64_t txg_to_consider = spa->spa_syncing_txg; + boolean_t check_snapshot = B_TRUE; + error = find_birth_txg(ds, zep, &latest_txg); + if (error == 0) { + if (zep->zb_birth == latest_txg) { + /* Block neither free nor rewritten. */ + if (!only_count) { + zbookmark_phys_t zb; + zep_to_zb(head_ds, zep, &zb); + if (copyout(&zb, (char *)uaddr + (*count - 1) + * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EFAULT)); + } + (*count)--; + } else { + (*count)++; + } + check_snapshot = B_FALSE; + } else { + ASSERT3U(zep->zb_birth, <, latest_txg); + txg_to_consider = latest_txg; + } + } + + /* How many snapshots reference this block. */ + uint64_t snap_count; + error = zap_count(spa->spa_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + if (snap_count == 0) { + /* File system has no snapshot. */ + dsl_dataset_rele(ds, FTAG); + return (0); + } + + uint64_t *snap_obj_array = kmem_alloc(snap_count * sizeof (uint64_t), + KM_SLEEP); + + int aff_snap_count = 0; + uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + + /* Check only snapshots created from this file system. */ + while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && + snap_obj_txg <= txg_to_consider) { + + dsl_dataset_rele(ds, FTAG); + error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds); + if (error != 0) + goto out; + + if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds) + break; + + boolean_t affected = B_TRUE; + if (check_snapshot) { + uint64_t blk_txg; + error = find_birth_txg(ds, zep, &blk_txg); + affected = (error == 0 && zep->zb_birth == blk_txg); + } + + if (affected) { + snap_obj_array[aff_snap_count] = snap_obj; + aff_snap_count++; + + if (!only_count) { + zbookmark_phys_t zb; + zep_to_zb(snap_obj, zep, &zb); + if (copyout(&zb, (char *)uaddr + (*count - 1) * + sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) { + dsl_dataset_rele(ds, FTAG); + error = SET_ERROR(EFAULT); + goto out; + } + (*count)--; + } else { + (*count)++; + } + + /* + * Only clones whose origins were affected could also + * have affected snapshots. 
+ */ + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, + dsl_dataset_phys(ds)->ds_next_clones_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + error = check_filesystem(spa, + za.za_first_integer, zep, + count, uaddr, only_count); + + if (error != 0) { + zap_cursor_fini(&zc); + goto out; + } + } + zap_cursor_fini(&zc); + } + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + } + dsl_dataset_rele(ds, FTAG); + +out: + kmem_free(snap_obj_array, sizeof (*snap_obj_array)); + return (error); +} + +static int +find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, + uint64_t *top_affected_fs) +{ + uint64_t oldest_dsobj; + int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth, + &oldest_dsobj); + if (error != 0) + return (error); + + dsl_dataset_t *ds; + error = dsl_dataset_hold_obj(spa->spa_dsl_pool, oldest_dsobj, + FTAG, &ds); + if (error != 0) + return (error); + + *top_affected_fs = + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static int +process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, + uint64_t *count, void *uaddr, boolean_t only_count) +{ + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_pool_config_enter(dp, FTAG); + uint64_t top_affected_fs; + + int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs); + if (error == 0) + error = check_filesystem(spa, top_affected_fs, zep, count, + uaddr, only_count); + + dsl_pool_config_exit(dp, FTAG); + return (error); +} + +static uint64_t +get_errlog_size(spa_t *spa, uint64_t spa_err_obj) +{ + if (spa_err_obj == 0) + return (0); + uint64_t total = 0; + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + + zap_cursor_t head_ds_cursor; + zap_attribute_t head_ds_attr; + zbookmark_err_phys_t head_ds_block; + + uint64_t head_ds; + name_to_object(za.za_name, &head_ds); + + for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset, + za.za_first_integer); zap_cursor_retrieve(&head_ds_cursor, + &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) { + + name_to_errphys(head_ds_attr.za_name, &head_ds_block); + (void) process_error_block(spa, head_ds, &head_ds_block, + &total, NULL, B_TRUE); + } + zap_cursor_fini(&head_ds_cursor); + } + zap_cursor_fini(&zc); + return (total); +} + +static uint64_t +get_errlist_size(spa_t *spa, avl_tree_t *tree) +{ + if (avl_numnodes(tree) == 0) + return (0); + uint64_t total = 0; + + spa_error_entry_t *se; + for (se = avl_first(tree); se != NULL; se = AVL_NEXT(tree, se)) { + zbookmark_err_phys_t zep; + zep.zb_object = se->se_bookmark.zb_object; + zep.zb_level = se->se_bookmark.zb_level; + zep.zb_blkid = se->se_bookmark.zb_blkid; + + /* + * If we cannot find out the head dataset and birth txg of + * the present error block, we opt not to error out. In the + * next pool sync this information will be retrieved by + * sync_error_list() and written to the on-disk error log. + */ + uint64_t head_ds_obj; + if (get_head_and_birth_txg(spa, &zep, + se->se_bookmark.zb_objset, &head_ds_obj) == 0) + (void) process_error_block(spa, head_ds_obj, &zep, + &total, NULL, B_TRUE); + } + return (total); +} +#endif + /* * Return the number of errors currently in the error log. 
This is actually the * sum of both the last log and the current log, since we don't know the union @@ -136,83 +519,284 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb) uint64_t spa_get_errlog_size(spa_t *spa) { - uint64_t total = 0, count; + uint64_t total = 0; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + mutex_enter(&spa->spa_errlog_lock); + uint64_t count; + if (spa->spa_errlog_scrub != 0 && + zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, + &count) == 0) + total += count; + + if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && + zap_count(spa->spa_meta_objset, spa->spa_errlog_last, + &count) == 0) + total += count; + mutex_exit(&spa->spa_errlog_lock); + + mutex_enter(&spa->spa_errlist_lock); + total += avl_numnodes(&spa->spa_errlist_last); + total += avl_numnodes(&spa->spa_errlist_scrub); + mutex_exit(&spa->spa_errlist_lock); + } else { +#ifdef _KERNEL + mutex_enter(&spa->spa_errlog_lock); + total += get_errlog_size(spa, spa->spa_errlog_last); + total += get_errlog_size(spa, spa->spa_errlog_scrub); + mutex_exit(&spa->spa_errlog_lock); + + mutex_enter(&spa->spa_errlist_lock); + total += get_errlist_size(spa, &spa->spa_errlist_last); + total += get_errlist_size(spa, &spa->spa_errlist_scrub); + mutex_exit(&spa->spa_errlist_lock); +#endif + } + return (total); +} - mutex_enter(&spa->spa_errlog_lock); - if (spa->spa_errlog_scrub != 0 && - zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub, - &count) == 0) - total += count; - - if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished && - zap_count(spa->spa_meta_objset, spa->spa_errlog_last, - &count) == 0) - total += count; - mutex_exit(&spa->spa_errlog_lock); +/* + * This function sweeps through an on-disk error log and stores all bookmarks + * as error bookmarks in a new ZAP object. At the end we discard the old one, + * and spa_update_errlog() will set the spa's on-disk error log to new ZAP + * object. + */ +static void +sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, + dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + zbookmark_phys_t zb; + uint64_t count; - mutex_enter(&spa->spa_errlist_lock); - total += avl_numnodes(&spa->spa_errlist_last); - total += avl_numnodes(&spa->spa_errlist_scrub); - mutex_exit(&spa->spa_errlist_lock); + *newobj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, + DMU_OT_NONE, 0, tx); - return (total); + /* + * If we cannnot perform the upgrade we should clear the old on-disk + * error logs. + */ + if (zap_count(spa->spa_meta_objset, spa_err_obj, &count) != 0) { + VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); + return; + } + + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + if (spa_upgrade_errlog_limit != 0 && + zc.zc_cd == spa_upgrade_errlog_limit) + break; + + name_to_bookmark(za.za_name, &zb); + + zbookmark_err_phys_t zep; + zep.zb_object = zb.zb_object; + zep.zb_level = zb.zb_level; + zep.zb_blkid = zb.zb_blkid; + + /* + * We cannot use get_head_and_birth_txg() because it will + * acquire the pool config lock, which we already have. In case + * of an error we simply continue. 
+ */ + uint64_t head_dataset_obj; + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_dataset_t *ds; + objset_t *os; + + int error = dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds); + if (error != 0) + continue; + + head_dataset_obj = + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; + + /* + * The objset and the dnode are required for getting the block + * pointer, which is used to determine if BP_IS_HOLE(). If + * getting the objset or the dnode fails, do not create a + * zap entry (presuming we know the dataset) as this may create + * spurious errors that we cannot ever resolve. If an error is + * truly persistent, it should re-appear after a scan. + */ + if (dmu_objset_from_ds(ds, &os) != 0) { + dsl_dataset_rele(ds, FTAG); + continue; + } + + dnode_t *dn; + blkptr_t bp; + + if (dnode_hold(os, zep.zb_object, FTAG, &dn) != 0) { + dsl_dataset_rele(ds, FTAG); + continue; + } + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + error = dbuf_dnode_findbp(dn, zep.zb_level, zep.zb_blkid, &bp, + NULL, NULL); + + zep.zb_birth = bp.blk_birth; + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); + + if (error != 0 || BP_IS_HOLE(&bp)) + continue; + + uint64_t err_obj; + error = zap_lookup_int_key(spa->spa_meta_objset, *newobj, + head_dataset_obj, &err_obj); + + if (error == ENOENT) { + err_obj = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); + + (void) zap_update_int_key(spa->spa_meta_objset, + *newobj, head_dataset_obj, err_obj, tx); + } + + char buf[64]; + char *name = ""; + errphys_to_name(&zep, buf, sizeof (buf)); + + (void) zap_update(spa->spa_meta_objset, err_obj, + buf, 1, strlen(name) + 1, name, tx); + } + zap_cursor_fini(&zc); + + VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); +} + +void +spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx) +{ + uint64_t newobj = 0; + + mutex_enter(&spa->spa_errlog_lock); + if (spa->spa_errlog_last != 0) { + sync_upgrade_errlog(spa, spa->spa_errlog_last, &newobj, tx); + spa->spa_errlog_last = newobj; + } + + if (spa->spa_errlog_scrub != 0) { + sync_upgrade_errlog(spa, spa->spa_errlog_scrub, &newobj, tx); + spa->spa_errlog_scrub = newobj; + } + mutex_exit(&spa->spa_errlog_lock); } #ifdef _KERNEL +/* + * If an error block is shared by two datasets it will be counted twice. For + * detailed message see spa_get_errlog_size() above. 
+ */ static int -process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) +process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count) { zap_cursor_t zc; zap_attribute_t za; - zbookmark_phys_t zb; if (obj == 0) return (0); - for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + if (*count == 0) { + zap_cursor_fini(&zc); + return (SET_ERROR(ENOMEM)); + } + + zbookmark_phys_t zb; + name_to_bookmark(za.za_name, &zb); + + if (copyout(&zb, (char *)uaddr + + (*count - 1) * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) { + zap_cursor_fini(&zc); + return (SET_ERROR(EFAULT)); + } + *count -= 1; - if (*count == 0) { - zap_cursor_fini(&zc); - return (SET_ERROR(ENOMEM)); } + zap_cursor_fini(&zc); + return (0); + } - name_to_bookmark(za.za_name, &zb); + for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { - if (copyout(&zb, (char *)addr + - (*count - 1) * sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) { - zap_cursor_fini(&zc); - return (SET_ERROR(EFAULT)); + zap_cursor_t head_ds_cursor; + zap_attribute_t head_ds_attr; + + uint64_t head_ds_err_obj = za.za_first_integer; + uint64_t head_ds; + name_to_object(za.za_name, &head_ds); + for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset, + head_ds_err_obj); zap_cursor_retrieve(&head_ds_cursor, + &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) { + + zbookmark_err_phys_t head_ds_block; + name_to_errphys(head_ds_attr.za_name, &head_ds_block); + int error = process_error_block(spa, head_ds, + &head_ds_block, count, uaddr, B_FALSE); + + if (error != 0) { + zap_cursor_fini(&head_ds_cursor); + zap_cursor_fini(&zc); + return (error); + } } - - *count -= 1; + zap_cursor_fini(&head_ds_cursor); } - zap_cursor_fini(&zc); - return (0); } static int -process_error_list(avl_tree_t *list, void *addr, size_t *count) +process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count) { spa_error_entry_t *se; - for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + for (se = avl_first(list); se != NULL; + se = AVL_NEXT(list, se)) { - if (*count == 0) - return (SET_ERROR(ENOMEM)); + if (*count == 0) + return (SET_ERROR(ENOMEM)); - if (copyout(&se->se_bookmark, (char *)addr + - (*count - 1) * sizeof (zbookmark_phys_t), - sizeof (zbookmark_phys_t)) != 0) - return (SET_ERROR(EFAULT)); + if (copyout(&se->se_bookmark, (char *)uaddr + + (*count - 1) * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) + return (SET_ERROR(EFAULT)); - *count -= 1; + *count -= 1; + } + return (0); } + for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { + zbookmark_err_phys_t zep; + zep.zb_object = se->se_bookmark.zb_object; + zep.zb_level = se->se_bookmark.zb_level; + zep.zb_blkid = se->se_bookmark.zb_blkid; + + uint64_t head_ds_obj; + int error = get_head_and_birth_txg(spa, &zep, + se->se_bookmark.zb_objset, &head_ds_obj); + if (error != 0) + return (error); + + error = process_error_block(spa, head_ds_obj, &zep, count, + uaddr, B_FALSE); + if (error != 0) + return (error); + } return (0); } #endif @@ -229,7 +813,7 @@ process_error_list(avl_tree_t *list, void *addr, size_t *count) * the error list 
lock when we are finished. */ int -spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) +spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count) { int ret = 0; @@ -244,10 +828,10 @@ spa_get_errlog(spa_t *spa, void *uaddr, size_t *count) mutex_enter(&spa->spa_errlist_lock); if (!ret) - ret = process_error_list(&spa->spa_errlist_scrub, uaddr, + ret = process_error_list(spa, &spa->spa_errlist_scrub, uaddr, count); if (!ret) - ret = process_error_list(&spa->spa_errlist_last, uaddr, + ret = process_error_list(spa, &spa->spa_errlist_last, uaddr, count); mutex_exit(&spa->spa_errlist_lock); @@ -299,35 +883,91 @@ spa_errlog_drain(spa_t *spa) /* * Process a list of errors into the current on-disk log. */ -static void +void sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) { spa_error_entry_t *se; char buf[64]; void *cookie; - if (avl_numnodes(t) != 0) { - /* create log if necessary */ - if (*obj == 0) - *obj = zap_create(spa->spa_meta_objset, - DMU_OT_ERROR_LOG, DMU_OT_NONE, - 0, tx); + if (avl_numnodes(t) == 0) + return; + + /* create log if necessary */ + if (*obj == 0) + *obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, + DMU_OT_NONE, 0, tx); - /* add errors to the current log */ + /* add errors to the current log */ + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { char *name = se->se_name ? se->se_name : ""; bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); + (void) zap_update(spa->spa_meta_objset, *obj, buf, 1, + strlen(name) + 1, name, tx); + } + } else { + for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { + char *name = se->se_name ? se->se_name : ""; + + zbookmark_err_phys_t zep; + zep.zb_object = se->se_bookmark.zb_object; + zep.zb_level = se->se_bookmark.zb_level; + zep.zb_blkid = se->se_bookmark.zb_blkid; + + /* + * If we cannot find out the head dataset and birth txg + * of the present error block, we simply continue. + * Reinserting that error block to the error lists, + * even if we are not syncing the final txg, results + * in duplicate posting of errors. 
+ */ + uint64_t head_dataset_obj; + int error = get_head_and_birth_txg(spa, &zep, + se->se_bookmark.zb_objset, &head_dataset_obj); + if (error != 0) + continue; + + uint64_t err_obj; + error = zap_lookup_int_key(spa->spa_meta_objset, + *obj, head_dataset_obj, &err_obj); + + if (error == ENOENT) { + err_obj = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); + + (void) zap_update_int_key(spa->spa_meta_objset, + *obj, head_dataset_obj, err_obj, tx); + } + errphys_to_name(&zep, buf, sizeof (buf)); + (void) zap_update(spa->spa_meta_objset, - *obj, buf, 1, strlen(name) + 1, name, tx); + err_obj, buf, 1, strlen(name) + 1, name, tx); } + } + /* purge the error list */ + cookie = NULL; + while ((se = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); +} - /* purge the error list */ - cookie = NULL; - while ((se = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(se, sizeof (spa_error_entry_t)); +static void +delete_errlog(spa_t *spa, uint64_t spa_err_obj, dmu_tx_t *tx) +{ + if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + VERIFY0(dmu_object_free(spa->spa_meta_objset, + za.za_first_integer, tx)); + } + zap_cursor_fini(&zc); } + VERIFY0(dmu_object_free(spa->spa_meta_objset, spa_err_obj, tx)); } /* @@ -378,8 +1018,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) */ if (scrub_finished) { if (spa->spa_errlog_last != 0) - VERIFY(dmu_object_free(spa->spa_meta_objset, - spa->spa_errlog_last, tx) == 0); + delete_errlog(spa, spa->spa_errlog_last, tx); spa->spa_errlog_last = spa->spa_errlog_scrub; spa->spa_errlog_scrub = 0; @@ -406,6 +1045,137 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) mutex_exit(&spa->spa_errlog_lock); } +static void +delete_dataset_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t ds, + dmu_tx_t *tx) +{ + if (spa_err_obj == 0) + return; + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + uint64_t head_ds; + name_to_object(za.za_name, &head_ds); + if (head_ds == ds) { + (void) zap_remove(spa->spa_meta_objset, spa_err_obj, + za.za_name, tx); + VERIFY0(dmu_object_free(spa->spa_meta_objset, + za.za_first_integer, tx)); + break; + } + } + zap_cursor_fini(&zc); +} + +void +spa_delete_dataset_errlog(spa_t *spa, uint64_t ds, dmu_tx_t *tx) +{ + mutex_enter(&spa->spa_errlog_lock); + delete_dataset_errlog(spa, spa->spa_errlog_scrub, ds, tx); + delete_dataset_errlog(spa, spa->spa_errlog_last, ds, tx); + mutex_exit(&spa->spa_errlog_lock); +} + +static int +find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head, + uint64_t *txg) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + + int error = dsl_dataset_hold_obj(dp, old_head, FTAG, &ds); + if (error != 0) + return (error); + + uint64_t prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + + while (prev_obj != 0) { + dsl_dataset_rele(ds, FTAG); + if ((error = dsl_dataset_hold_obj(dp, prev_obj, + FTAG, &ds)) == 0 && + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj == new_head) + break; + + if (error != 0) + return (error); + + prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + } + dsl_dataset_rele(ds, FTAG); + ASSERT(prev_obj != 0); + *txg 
= prev_obj_txg; + return (0); +} + +static void +swap_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t new_head, uint64_t + old_head, dmu_tx_t *tx) +{ + if (spa_err_obj == 0) + return; + + uint64_t old_head_errlog; + int error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, + old_head, &old_head_errlog); + + /* If no error log, then there is nothing to do. */ + if (error != 0) + return; + + uint64_t txg; + error = find_txg_ancestor_snapshot(spa, new_head, old_head, &txg); + if (error != 0) + return; + + /* + * Create an error log if the file system being promoted does not + * already have one. + */ + uint64_t new_head_errlog; + error = zap_lookup_int_key(spa->spa_meta_objset, spa_err_obj, new_head, + &new_head_errlog); + + if (error != 0) { + new_head_errlog = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); + + (void) zap_update_int_key(spa->spa_meta_objset, spa_err_obj, + new_head, new_head_errlog, tx); + } + + zap_cursor_t zc; + zap_attribute_t za; + zbookmark_err_phys_t err_block; + for (zap_cursor_init(&zc, spa->spa_meta_objset, old_head_errlog); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + + char *name = ""; + name_to_errphys(za.za_name, &err_block); + if (err_block.zb_birth < txg) { + (void) zap_update(spa->spa_meta_objset, new_head_errlog, + za.za_name, 1, strlen(name) + 1, name, tx); + + (void) zap_remove(spa->spa_meta_objset, old_head_errlog, + za.za_name, tx); + } + } + zap_cursor_fini(&zc); +} + +void +spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds, + dmu_tx_t *tx) +{ + mutex_enter(&spa->spa_errlog_lock); + swap_errlog(spa, spa->spa_errlog_scrub, new_head_ds, old_head_ds, tx); + swap_errlog(spa, spa->spa_errlog_last, new_head_ds, old_head_ds, tx); + mutex_exit(&spa->spa_errlog_lock); +} + #if defined(_KERNEL) /* error handling */ EXPORT_SYMBOL(spa_log_error); @@ -415,4 +1185,14 @@ EXPORT_SYMBOL(spa_errlog_rotate); EXPORT_SYMBOL(spa_errlog_drain); EXPORT_SYMBOL(spa_errlog_sync); EXPORT_SYMBOL(spa_get_errlists); +EXPORT_SYMBOL(spa_delete_dataset_errlog); +EXPORT_SYMBOL(spa_swap_errlog); +EXPORT_SYMBOL(sync_error_list); +EXPORT_SYMBOL(spa_upgrade_errlog); #endif + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_spa, spa_, upgrade_errlog_limit, INT, ZMOD_RW, + "Limit the number of errors which will be upgraded to the new " + "on-disk error log when enabling head_errlog"); +/* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c b/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c index 110a4eab99f9..f831509a4247 100644 --- a/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c +++ b/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c @@ -257,7 +257,12 @@ static unsigned long zfs_unflushed_log_block_min = 1000; * terms of performance. Thus we have a hard limit in the size of the log in * terms of blocks. */ -static unsigned long zfs_unflushed_log_block_max = (1ULL << 18); +static unsigned long zfs_unflushed_log_block_max = (1ULL << 17); + +/* + * Also we have a hard limit in the size of the log in terms of dirty TXGs. + */ +static unsigned long zfs_unflushed_log_txg_max = 1000; /* * Max # of rows allowed for the log_summary. 
The tradeoff here is accuracy and @@ -333,9 +338,13 @@ spa_log_sm_set_blocklimit(spa_t *spa) return; } - uint64_t calculated_limit = - (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100; - spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit, + uint64_t msdcount = 0; + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e; e = list_next(&spa->spa_log_summary, e)) + msdcount += e->lse_msdcount; + + uint64_t limit = msdcount * zfs_unflushed_log_block_pct / 100; + spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(limit, zfs_unflushed_log_block_min), zfs_unflushed_log_block_max); } @@ -380,8 +389,13 @@ spa_log_summary_verify_counts(spa_t *spa) } static boolean_t -summary_entry_is_full(spa_t *spa, log_summary_entry_t *e) +summary_entry_is_full(spa_t *spa, log_summary_entry_t *e, uint64_t txg) { + if (e->lse_end == txg) + return (0); + if (e->lse_txgcount >= DIV_ROUND_UP(zfs_unflushed_log_txg_max, + zfs_max_logsm_summary_length)) + return (1); uint64_t blocks_per_row = MAX(1, DIV_ROUND_UP(spa_log_sm_blocklimit(spa), zfs_max_logsm_summary_length)); @@ -401,7 +415,7 @@ summary_entry_is_full(spa_t *spa, log_summary_entry_t *e) * the metaslab. */ void -spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg) +spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg, boolean_t dirty) { /* * We don't track summary data for read-only pools and this function @@ -429,6 +443,8 @@ spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg) } target->lse_mscount--; + if (dirty) + target->lse_msdcount--; } /* @@ -490,8 +506,10 @@ spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg) void spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone) { - for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); - e != NULL; e = list_head(&spa->spa_log_summary)) { + log_summary_entry_t *e = list_head(&spa->spa_log_summary); + if (e->lse_txgcount > 0) + e->lse_txgcount--; + for (; e != NULL; e = list_head(&spa->spa_log_summary)) { if (e->lse_blkcount > blocks_gone) { /* * Assert that we stopped at an entry that is not @@ -560,31 +578,52 @@ spa_log_sm_increment_current_mscount(spa_t *spa) static void summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed, - uint64_t nblocks) + uint64_t metaslabs_dirty, uint64_t nblocks) { log_summary_entry_t *e = list_tail(&spa->spa_log_summary); - if (e == NULL || summary_entry_is_full(spa, e)) { + if (e == NULL || summary_entry_is_full(spa, e, txg)) { e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP); - e->lse_start = txg; + e->lse_start = e->lse_end = txg; + e->lse_txgcount = 1; list_insert_tail(&spa->spa_log_summary, e); } ASSERT3U(e->lse_start, <=, txg); + if (e->lse_end < txg) { + e->lse_end = txg; + e->lse_txgcount++; + } e->lse_mscount += metaslabs_flushed; + e->lse_msdcount += metaslabs_dirty; e->lse_blkcount += nblocks; } static void spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks) { - summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks); + summary_add_data(spa, spa_syncing_txg(spa), 0, 0, nblocks); } void -spa_log_summary_add_flushed_metaslab(spa_t *spa) +spa_log_summary_add_flushed_metaslab(spa_t *spa, boolean_t dirty) { - summary_add_data(spa, spa_syncing_txg(spa), 1, 0); + summary_add_data(spa, spa_syncing_txg(spa), 1, dirty ? 
1 : 0, 0); +} + +void +spa_log_summary_dirty_flushed_metaslab(spa_t *spa, uint64_t txg) +{ + log_summary_entry_t *target = NULL; + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e != NULL; e = list_next(&spa->spa_log_summary, e)) { + if (e->lse_start > txg) + break; + target = e; + } + ASSERT3P(target, !=, NULL); + ASSERT3U(target->lse_mscount, !=, 0); + target->lse_msdcount++; } /* @@ -630,6 +669,11 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) int64_t available_blocks = spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming; + int64_t available_txgs = zfs_unflushed_log_txg_max; + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e; e = list_next(&spa->spa_log_summary, e)) + available_txgs -= e->lse_txgcount; + /* * This variable tells us the total number of flushes needed to * keep the log size within the limit when we reach txgs_in_future. @@ -637,9 +681,7 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) uint64_t total_flushes = 0; /* Holds the current maximum of our estimates so far. */ - uint64_t max_flushes_pertxg = - MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed), - zfs_min_metaslabs_to_flush); + uint64_t max_flushes_pertxg = zfs_min_metaslabs_to_flush; /* * For our estimations we only look as far in the future @@ -653,11 +695,14 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) * then keep skipping TXGs accumulating more blocks * based on the incoming rate until we exceed it. */ - if (available_blocks >= 0) { - uint64_t skip_txgs = (available_blocks / incoming) + 1; + if (available_blocks >= 0 && available_txgs >= 0) { + uint64_t skip_txgs = MIN(available_txgs + 1, + (available_blocks / incoming) + 1); available_blocks -= (skip_txgs * incoming); + available_txgs -= skip_txgs; txgs_in_future += skip_txgs; ASSERT3S(available_blocks, >=, -incoming); + ASSERT3S(available_txgs, >=, -1); } /* @@ -666,9 +711,10 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) * based on the current entry in the summary, updating * our available_blocks. 
*/ - ASSERT3S(available_blocks, <, 0); + ASSERT(available_blocks < 0 || available_txgs < 0); available_blocks += e->lse_blkcount; - total_flushes += e->lse_mscount; + available_txgs += e->lse_txgcount; + total_flushes += e->lse_msdcount; /* * Keep the running maximum of the total_flushes that @@ -680,8 +726,6 @@ spa_estimate_metaslabs_to_flush(spa_t *spa) */ max_flushes_pertxg = MAX(max_flushes_pertxg, DIV_ROUND_UP(total_flushes, txgs_in_future)); - ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, - max_flushes_pertxg); } return (max_flushes_pertxg); } @@ -771,14 +815,11 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) uint64_t want_to_flush; if (spa_flush_all_logs_requested(spa)) { ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED); - want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed); + want_to_flush = UINT64_MAX; } else { want_to_flush = spa_estimate_metaslabs_to_flush(spa); } - ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, - want_to_flush); - /* Used purely for verification purposes */ uint64_t visited = 0; @@ -809,31 +850,22 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa)) break; - mutex_enter(&curr->ms_sync_lock); - mutex_enter(&curr->ms_lock); - boolean_t flushed = metaslab_flush(curr, tx); - mutex_exit(&curr->ms_lock); - mutex_exit(&curr->ms_sync_lock); - - /* - * If we failed to flush a metaslab (because it was loading), - * then we are done with the block heuristic as it's not - * possible to destroy any log space maps once you've skipped - * a metaslab. In that case we just set our counter to 0 but - * we continue looping in case there is still memory pressure - * due to unflushed changes. Note that, flushing a metaslab - * that is not the oldest flushed in the pool, will never - * destroy any log space maps [see spa_cleanup_old_sm_logs()]. - */ - if (!flushed) { - want_to_flush = 0; - } else if (want_to_flush > 0) { - want_to_flush--; - } + if (metaslab_unflushed_dirty(curr)) { + mutex_enter(&curr->ms_sync_lock); + mutex_enter(&curr->ms_lock); + metaslab_flush(curr, tx); + mutex_exit(&curr->ms_lock); + mutex_exit(&curr->ms_sync_lock); + if (want_to_flush > 0) + want_to_flush--; + } else + metaslab_unflushed_bump(curr, tx, B_FALSE); visited++; } ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited); + + spa_log_sm_set_blocklimit(spa); } /* @@ -904,6 +936,7 @@ spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx) avl_remove(&spa->spa_sm_logs_by_txg, sls); space_map_free_obj(mos, sls->sls_sm_obj, tx); VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx)); + spa_log_summary_decrement_blkcount(spa, sls->sls_nblocks); spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks; kmem_free(sls, sizeof (spa_log_sm_t)); } @@ -963,12 +996,7 @@ spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx) VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); - /* - * If the log space map feature was just enabled, the blocklimit - * has not yet been set. 
- */ - if (spa_log_sm_blocklimit(spa) == 0) - spa_log_sm_set_blocklimit(spa); + spa_log_sm_set_blocklimit(spa); } /* @@ -1094,12 +1122,18 @@ spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg) panic("invalid maptype_t"); break; } + if (!metaslab_unflushed_dirty(ms)) { + metaslab_set_unflushed_dirty(ms, B_TRUE); + spa_log_summary_dirty_flushed_metaslab(spa, + metaslab_unflushed_txg(ms)); + } return (0); } static int spa_ld_log_sm_data(spa_t *spa) { + spa_log_sm_t *sls, *psls; int error = 0; /* @@ -1113,41 +1147,71 @@ spa_ld_log_sm_data(spa_t *spa) ASSERT0(spa->spa_unflushed_stats.sus_memused); hrtime_t read_logs_starttime = gethrtime(); - /* this is a no-op when we don't have space map logs */ - for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); - sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { - space_map_t *sm = NULL; - error = space_map_open(&sm, spa_meta_objset(spa), - sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT); - if (error != 0) { - spa_load_failed(spa, "spa_ld_log_sm_data(): failed at " - "space_map_open(obj=%llu) [error %d]", - (u_longlong_t)sls->sls_sm_obj, error); - goto out; + + /* Prefetch log spacemaps dnodes. */ + for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls; + sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + dmu_prefetch(spa_meta_objset(spa), sls->sls_sm_obj, + 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + } + + uint_t pn = 0; + uint64_t ps = 0; + psls = sls = avl_first(&spa->spa_sm_logs_by_txg); + while (sls != NULL) { + /* Prefetch log spacemaps up to 16 TXGs or MBs ahead. */ + if (psls != NULL && pn < 16 && + (pn < 2 || ps < 2 * dmu_prefetch_max)) { + error = space_map_open(&psls->sls_sm, + spa_meta_objset(spa), psls->sls_sm_obj, 0, + UINT64_MAX, SPA_MINBLOCKSHIFT); + if (error != 0) { + spa_load_failed(spa, "spa_ld_log_sm_data(): " + "failed at space_map_open(obj=%llu) " + "[error %d]", + (u_longlong_t)sls->sls_sm_obj, error); + goto out; + } + dmu_prefetch(spa_meta_objset(spa), psls->sls_sm_obj, + 0, 0, space_map_length(psls->sls_sm), + ZIO_PRIORITY_ASYNC_READ); + pn++; + ps += space_map_length(psls->sls_sm); + psls = AVL_NEXT(&spa->spa_sm_logs_by_txg, psls); + continue; } + /* Load TXG log spacemap into ms_unflushed_allocs/frees. */ + cond_resched(); + ASSERT0(sls->sls_nblocks); + sls->sls_nblocks = space_map_nblocks(sls->sls_sm); + spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks; + summary_add_data(spa, sls->sls_txg, + sls->sls_mscount, 0, sls->sls_nblocks); + struct spa_ld_log_sm_arg vla = { .slls_spa = spa, .slls_txg = sls->sls_txg }; - error = space_map_iterate(sm, space_map_length(sm), - spa_ld_log_sm_cb, &vla); + error = space_map_iterate(sls->sls_sm, + space_map_length(sls->sls_sm), spa_ld_log_sm_cb, &vla); if (error != 0) { - space_map_close(sm); spa_load_failed(spa, "spa_ld_log_sm_data(): failed " "at space_map_iterate(obj=%llu) [error %d]", (u_longlong_t)sls->sls_sm_obj, error); goto out; } - ASSERT0(sls->sls_nblocks); - sls->sls_nblocks = space_map_nblocks(sm); - spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks; - summary_add_data(spa, sls->sls_txg, - sls->sls_mscount, sls->sls_nblocks); + pn--; + ps -= space_map_length(sls->sls_sm); + space_map_close(sls->sls_sm); + sls->sls_sm = NULL; + sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls); - space_map_close(sm); + /* Update log block limits considering just loaded. 
*/ + spa_log_sm_set_blocklimit(spa); } + hrtime_t read_logs_endtime = gethrtime(); spa_load_note(spa, "read %llu log space maps (%llu total blocks - blksz = %llu bytes) " @@ -1157,6 +1221,18 @@ spa_ld_log_sm_data(spa_t *spa) (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000)); out: + if (error != 0) { + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + if (sls->sls_sm) { + space_map_close(sls->sls_sm); + sls->sls_sm = NULL; + } + } + } else { + ASSERT0(pn); + ASSERT0(ps); + } /* * Now that the metaslabs contain their unflushed changes: * [1] recalculate their actual allocated space @@ -1237,6 +1313,9 @@ spa_ld_unflushed_txgs(vdev_t *vd) } ms->ms_unflushed_txg = entry.msp_unflushed_txg; + ms->ms_unflushed_dirty = B_FALSE; + ASSERT(range_tree_is_empty(ms->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(ms->ms_unflushed_frees)); if (ms->ms_unflushed_txg != 0) { mutex_enter(&spa->spa_flushed_ms_lock); avl_add(&spa->spa_metaslabs_by_flushed, ms); @@ -1300,6 +1379,10 @@ ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, ULONG, ZMOD_RW, "Lower-bound limit for the maximum amount of blocks allowed in " "log spacemap (see zfs_unflushed_log_block_max)"); +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_txg_max, ULONG, ZMOD_RW, + "Hard limit (upper-bound) in the size of the space map log " + "in terms of dirty TXGs."); + ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, ULONG, ZMOD_RW, "Tunable used to determine the number of blocks that can be used for " "the spacemap log, expressed as a percentage of the total number of " diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index db2d2c5e44fb..ce7f020a0d86 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -1523,13 +1523,6 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); - /* - * Regardless whether this vdev was just added or it is being - * expanded, the metaslab count has changed. Recalculate the - * block limit. - */ - spa_log_sm_set_blocklimit(spa); - return (0); } diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c index 17f9d6c90804..5508d273758d 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_removal.c +++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c @@ -1386,7 +1386,6 @@ vdev_remove_complete(spa_t *spa) vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; - spa_log_sm_set_blocklimit(spa); } if (vd->vdev_log_mg != NULL) { ASSERT0(vd->vdev_ms_count); @@ -2131,7 +2130,6 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) * metaslab_class_histogram_verify() */ vdev_metaslab_fini(vd); - spa_log_sm_set_blocklimit(spa); spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); *txg = spa_vdev_config_enter(spa); @@ -2251,7 +2249,6 @@ spa_vdev_remove_top_check(vdev_t *vd) * and not be raidz or draid. 
*/ vdev_t *rvd = spa->spa_root_vdev; - int num_indirect = 0; for (uint64_t id = 0; id < rvd->vdev_children; id++) { vdev_t *cvd = rvd->vdev_child[id]; @@ -2267,8 +2264,6 @@ spa_vdev_remove_top_check(vdev_t *vd) if (cvd->vdev_ashift != 0 && cvd->vdev_alloc_bias == VDEV_BIAS_NONE) ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift); - if (cvd->vdev_ops == &vdev_indirect_ops) - num_indirect++; if (!vdev_is_concrete(cvd)) continue; if (vdev_get_nparity(cvd) != 0) diff --git a/sys/contrib/openzfs/module/zfs/zfeature.c b/sys/contrib/openzfs/module/zfs/zfeature.c index 9d16fff81d0a..fc9167aa6611 100644 --- a/sys/contrib/openzfs/module/zfs/zfeature.c +++ b/sys/contrib/openzfs/module/zfs/zfeature.c @@ -389,6 +389,13 @@ feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) !spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION) && feature->fi_feature == SPA_FEATURE_BOOKMARK_V2) spa->spa_errata = 0; + + /* + * Convert the old on-disk error log to the new format when activating + * the head_errlog feature. + */ + if (feature->fi_feature == SPA_FEATURE_HEAD_ERRLOG) + spa_upgrade_errlog(spa, tx); } static void diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c index a2824c5cc804..b3f32d64f3ef 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c @@ -5670,7 +5670,7 @@ zfs_ioc_error_log(zfs_cmd_t *zc) { spa_t *spa; int error; - size_t count = (size_t)zc->zc_nvlist_dst_size; + uint64_t count = zc->zc_nvlist_dst_size; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c index 62806e9fe8b1..a039b4da2833 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c +++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c @@ -68,7 +68,9 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); + atomic_inc_32(&zp->z_sync_writes_cnt); zil_commit(zfsvfs->z_log, zp->z_id); + atomic_dec_32(&zp->z_sync_writes_cnt); ZFS_EXIT(zfsvfs); } tsd_set(zfs_fsyncer_key, NULL); @@ -357,11 +359,11 @@ zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr, if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) { vattr_t va = {0}; - va.va_mask = AT_MODE; + va.va_mask = ATTR_MODE; va.va_nodeid = zp->z_id; va.va_mode = newmode; - zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va, AT_MODE, - NULL); + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va, + ATTR_MODE, NULL); *clear_setid_bits_txgp = dmu_tx_get_txg(tx); } } else { diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index f6adea572418..2a16d5cef2e2 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -166,15 +166,6 @@ zio_init(void) cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? KMC_NODEBUG : 0; -#if defined(_ILP32) && defined(_KERNEL) - /* - * Cache size limited to 1M on 32-bit platforms until ARC - * buffers no longer require virtual address space. 
- */ - if (size > zfs_max_recordsize) - break; -#endif - while (!ISP2(p2)) p2 &= p2 - 1; diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c index eb68b05c567b..ac7c3a0c3232 100644 --- a/sys/contrib/openzfs/module/zfs/zvol.c +++ b/sys/contrib/openzfs/module/zfs/zvol.c @@ -513,6 +513,7 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* TX_MKDIR_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ zvol_replay_err, /* TX_WRITE2 */ + zvol_replay_err, /* TX_SETSAXATTR */ }; /* diff --git a/sys/contrib/openzfs/module/zstd/Makefile.in b/sys/contrib/openzfs/module/zstd/Makefile.in deleted file mode 100644 index 80096c3e379d..000000000000 --- a/sys/contrib/openzfs/module/zstd/Makefile.in +++ /dev/null @@ -1,69 +0,0 @@ -ifneq ($(KBUILD_EXTMOD),) -src = @abs_srcdir@ -obj = @abs_builddir@ -zstd_include = $(src)/include -else -zstd_include = $(srctree)/$(src)/include -endif - -MODULE := zzstd - -obj-$(CONFIG_ZFS) := $(MODULE).o - -asflags-y := -I$(zstd_include) -ccflags-y := -I$(zstd_include) - -# Zstd uses -O3 by default, so we should follow -ccflags-y += -O3 - -# -fno-tree-vectorize gets set for gcc in zstd/common/compiler.h -# Set it for other compilers, too. -common_flags := -fno-tree-vectorize - -# SSE register return with SSE disabled if -march=znverX is passed -common_flags += -U__BMI__ - -# Quiet warnings about frame size due to unused code in unmodified zstd lib -common_flags += -Wframe-larger-than=20480 - -ccflags-y += $(common_flags) - -vanilla-objs := lib/common/entropy_common.o \ - lib/common/error_private.o \ - lib/common/fse_decompress.o \ - lib/common/pool.o \ - lib/common/zstd_common.o \ - lib/compress/fse_compress.o \ - lib/compress/hist.o \ - lib/compress/huf_compress.o \ - lib/compress/zstd_compress_literals.o \ - lib/compress/zstd_compress_sequences.o \ - lib/compress/zstd_compress_superblock.o \ - lib/compress/zstd_compress.o \ - lib/compress/zstd_double_fast.o \ - lib/compress/zstd_fast.o \ - lib/compress/zstd_lazy.o \ - lib/compress/zstd_ldm.o \ - lib/compress/zstd_opt.o \ - lib/decompress/huf_decompress.o \ - lib/decompress/zstd_ddict.o \ - lib/decompress/zstd_decompress.o \ - lib/decompress/zstd_decompress_block.o - -# Disable aarch64 neon SIMD instructions for kernel mode -$(addprefix $(obj)/,$(vanilla-objs)) : ccflags-y += -include $(zstd_include)/aarch64_compat.h -include $(zstd_include)/zstd_compat_wrapper.h -Wp,-w $(common_flags) - -$(obj)/zfs_zstd.o: ccflags-y += -include $(zstd_include)/zstd_compat_wrapper.h $(common_flags) - -$(MODULE)-objs += zfs_zstd.o -$(MODULE)-objs += zstd_sparc.o -$(MODULE)-objs += $(vanilla-objs) - -all: - mkdir -p lib/common lib/compress lib/decompress - -gensymbols: - for obj in $(vanilla-objs); do echo; echo "/* $$obj: */"; @OBJDUMP@ -t $$obj | awk '$$2 == "g" && !/ zfs_/ {print "#define\t" $$6 " zfs_" $$6}' | sort; done >> include/zstd_compat_wrapper.h - -checksymbols: - @OBJDUMP@ -t $(vanilla-objs) | awk '/file format/ {print} $$2 == "g" && !/ zfs_/ {++ret; print} END {exit ret}' diff --git a/sys/contrib/openzfs/module/zstd/README.md b/sys/contrib/openzfs/module/zstd/README.md index 26d618b61b6e..7ad00e0bd804 100644 --- a/sys/contrib/openzfs/module/zstd/README.md +++ b/sys/contrib/openzfs/module/zstd/README.md @@ -9,7 +9,7 @@ library, besides upgrading to a newer ZSTD release. Tree structure: -* `zfs_zstd.c` is the actual `zzstd` kernel module. +* `zfs_zstd.c` are the actual `zfs` kernel module hooks. 
* `lib/` contains the unmodified version of the `Zstandard` library * `zstd-in.c` is our template file for generating the single-file library * `include/`: This directory contains supplemental includes for platform @@ -25,16 +25,7 @@ To update ZSTD the following steps need to be taken: `grep include [path to zstd]/contrib/single_file_libs/zstd-in.c | awk '{ print $2 }'` 3. Remove debug.c, threading.c, and zstdmt_compress.c. 4. Update Makefiles with resulting file lists. - -~~~ - -Note: if the zstd library for zfs is updated to a newer version, -the macro list in include/zstd_compat_wrapper.h usually needs to be updated. -this can be done with some hand crafting of the output of the following -script (on the object file generated from the "single-file library" script in zstd's -contrib/single_file_libs): -`nm zstd.o | awk '{print "#define "$3 " zfs_" $3}' > macrotable` - +5. Follow symbol renaming notes in `include/zstd_compat_wrapper.h` ## Altering ZSTD and breaking changes diff --git a/sys/contrib/openzfs/module/zstd/include/zstd_compat_wrapper.h b/sys/contrib/openzfs/module/zstd/include/zstd_compat_wrapper.h index de428175c7df..2c4baad27d4e 100644 --- a/sys/contrib/openzfs/module/zstd/include/zstd_compat_wrapper.h +++ b/sys/contrib/openzfs/module/zstd/include/zstd_compat_wrapper.h @@ -38,7 +38,7 @@ * This will cause a symbol collision with the older in-kernel zstd library. * * On update, truncate this file at the scissor line, rebuild the module, - * and make gensymbols. + * and make gen-zstd-symbols. */ #define MEM_MODULE diff --git a/sys/contrib/openzfs/module/zstd/lib/compress/fse_compress.c b/sys/contrib/openzfs/module/zstd/lib/compress/fse_compress.c index a42759814fdd..e27414ccbbcd 100644 --- a/sys/contrib/openzfs/module/zstd/lib/compress/fse_compress.c +++ b/sys/contrib/openzfs/module/zstd/lib/compress/fse_compress.c @@ -304,7 +304,7 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog) { - size_t size; + size_t size __attribute__ ((unused)); if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32); return (FSE_CTable*)malloc(size); diff --git a/sys/contrib/openzfs/module/zstd/lib/compress/zstd_compress_superblock.c b/sys/contrib/openzfs/module/zstd/lib/compress/zstd_compress_superblock.c index b693866c0ac1..ffa4bb67597f 100644 --- a/sys/contrib/openzfs/module/zstd/lib/compress/zstd_compress_superblock.c +++ b/sys/contrib/openzfs/module/zstd/lib/compress/zstd_compress_superblock.c @@ -409,7 +409,7 @@ static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* const seqDef* const send = sequences + nbSeq; const seqDef* sp = sstart; size_t matchLengthSum = 0; - size_t litLengthSum = 0; + size_t litLengthSum __attribute__ ((unused)) = 0; while (send-sp > 0) { ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp); litLengthSum += seqLen.litLength; diff --git a/sys/contrib/openzfs/module/zstd/zfs_zstd.c b/sys/contrib/openzfs/module/zstd/zfs_zstd.c index 7f042b5bcd6f..2ccc6818754e 100644 --- a/sys/contrib/openzfs/module/zstd/zfs_zstd.c +++ b/sys/contrib/openzfs/module/zstd/zfs_zstd.c @@ -50,7 +50,7 @@ #include "lib/zstd.h" #include "lib/common/zstd_errors.h" -kstat_t *zstd_ksp = NULL; +static kstat_t *zstd_ksp = NULL; typedef struct zstd_stats { kstat_named_t zstd_stat_alloc_fail; @@ -702,7 +702,7 @@ zstd_meminit(void) } /* Release object from pool and free memory */ -static void 
__exit +static void release_pool(struct zstd_pool *pool) { mutex_destroy(&pool->barrier); @@ -712,7 +712,7 @@ release_pool(struct zstd_pool *pool) } /* Release memory pool objects */ -static void __exit +static void zstd_mempool_deinit(void) { for (int i = 0; i < ZSTD_POOL_MAX; i++) { @@ -765,7 +765,7 @@ zstd_init(void) return (0); } -extern void __exit +extern void zstd_fini(void) { /* Deinitialize kstat */ @@ -783,12 +783,10 @@ zstd_fini(void) } #if defined(_KERNEL) +#ifdef __FreeBSD__ module_init(zstd_init); module_exit(zstd_fini); - -ZFS_MODULE_DESCRIPTION("ZSTD Compression for ZFS"); -ZFS_MODULE_LICENSE("Dual BSD/GPL"); -ZFS_MODULE_VERSION(ZSTD_VERSION_STRING "a"); +#endif EXPORT_SYMBOL(zfs_zstd_compress); EXPORT_SYMBOL(zfs_zstd_decompress_level); |